// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>

#include "dev.h"

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and that the current process has it in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and that the current process has it in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and that the current process has it over the network
 * namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
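For illustration, a protocol's handler for a privileged socket option might gate on these helpers. The function below is an editor's sketch under that assumption, not code from this file:

static int example_set_privileged_option(struct sock *sk, int val)
{
	/* Require CAP_NET_ADMIN over the socket's own network namespace,
	 * both for the task that created the socket and for the caller.
	 */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	WRITE_ONCE(sk->sk_priority, val);
	return 0;
}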

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"     ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

295/**
296 * sk_set_memalloc - sets %SOCK_MEMALLOC
297 * @sk: socket to set it on
298 *
299 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
300 * It's the responsibility of the admin to adjust min_free_kbytes
301 * to meet the requirements
302 */
303void sk_set_memalloc(struct sock *sk)
304{
305 sock_set_flag(sk, SOCK_MEMALLOC);
306 sk->sk_allocation |= __GFP_MEMALLOC;
a7950ae8 307 static_branch_inc(&memalloc_socks_key);
7cb02404
MG
308}
309EXPORT_SYMBOL_GPL(sk_set_memalloc);
310
311void sk_clear_memalloc(struct sock *sk)
312{
313 sock_reset_flag(sk, SOCK_MEMALLOC);
314 sk->sk_allocation &= ~__GFP_MEMALLOC;
a7950ae8 315 static_branch_dec(&memalloc_socks_key);
c76562b6
MG
316
317 /*
318 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
5d753610
MG
319 * progress of swapping. SOCK_MEMALLOC may be cleared while
320 * it has rmem allocations due to the last swapfile being deactivated
321 * but there is a risk that the socket is unusable due to exceeding
322 * the rmem limits. Reclaim the reserves and obey rmem limits again.
c76562b6 323 */
5d753610 324 sk_mem_reclaim(sk);
7cb02404
MG
325}
326EXPORT_SYMBOL_GPL(sk_clear_memalloc);
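A hedged sketch of how a swap-over-network style caller uses these two helpers; the function name and calling context below are the editor's assumptions, not code from this file:

static void example_mark_swap_transport(struct socket *xprt_sock)
{
	/* Let allocations on behalf of this socket dip into the emergency
	 * memory reserves while the system is reclaiming memory ...
	 */
	sk_set_memalloc(xprt_sock->sk);

	/* ... and drop that privilege again once the last swap file backed
	 * by this transport has been deactivated.
	 */
	sk_clear_memalloc(xprt_sock->sk);
}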
327
b4b9e355
MG
328int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
329{
330 int ret;
f1083048 331 unsigned int noreclaim_flag;
b4b9e355
MG
332
333 /* these should have been dropped before queueing */
334 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
335
f1083048 336 noreclaim_flag = memalloc_noreclaim_save();
d2489c7b
ED
337 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
338 tcp_v6_do_rcv,
339 tcp_v4_do_rcv,
340 sk, skb);
f1083048 341 memalloc_noreclaim_restore(noreclaim_flag);
b4b9e355
MG
342
343 return ret;
344}
345EXPORT_SYMBOL(__sk_backlog_rcv);
346
e3ae2365
AA
347void sk_error_report(struct sock *sk)
348{
349 sk->sk_error_report(sk);
e6a3e443
AA
350
351 switch (sk->sk_family) {
352 case AF_INET:
353 fallthrough;
354 case AF_INET6:
355 trace_inet_sk_error_report(sk);
356 break;
357 default:
358 break;
359 }
e3ae2365
AA
360}
361EXPORT_SYMBOL(sk_error_report);
362
4c1e34c0 363int sock_get_timeout(long timeo, void *optval, bool old_timeval)
fe0c72f3 364{
a9beb86a 365 struct __kernel_sock_timeval tv;
fe0c72f3
AB
366
367 if (timeo == MAX_SCHEDULE_TIMEOUT) {
368 tv.tv_sec = 0;
369 tv.tv_usec = 0;
370 } else {
371 tv.tv_sec = timeo / HZ;
372 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
373 }
374
e6986423 375 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
fe0c72f3
AB
376 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
377 *(struct old_timeval32 *)optval = tv32;
378 return sizeof(tv32);
379 }
380
a9beb86a
DD
381 if (old_timeval) {
382 struct __kernel_old_timeval old_tv;
383 old_tv.tv_sec = tv.tv_sec;
384 old_tv.tv_usec = tv.tv_usec;
385 *(struct __kernel_old_timeval *)optval = old_tv;
28e72b26 386 return sizeof(old_tv);
a9beb86a
DD
387 }
388
28e72b26
VC
389 *(struct __kernel_sock_timeval *)optval = tv;
390 return sizeof(tv);
fe0c72f3 391}
4c1e34c0 392EXPORT_SYMBOL(sock_get_timeout);
fe0c72f3 393
4c1e34c0
RP
394int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
395 sockptr_t optval, int optlen, bool old_timeval)
1da177e4 396{
e6986423 397 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
fe0c72f3
AB
398 struct old_timeval32 tv32;
399
400 if (optlen < sizeof(tv32))
401 return -EINVAL;
402
c34645ac 403 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
fe0c72f3 404 return -EFAULT;
4c1e34c0
RP
405 tv->tv_sec = tv32.tv_sec;
406 tv->tv_usec = tv32.tv_usec;
a9beb86a
DD
407 } else if (old_timeval) {
408 struct __kernel_old_timeval old_tv;
409
410 if (optlen < sizeof(old_tv))
411 return -EINVAL;
c34645ac 412 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
a9beb86a 413 return -EFAULT;
4c1e34c0
RP
414 tv->tv_sec = old_tv.tv_sec;
415 tv->tv_usec = old_tv.tv_usec;
fe0c72f3 416 } else {
4c1e34c0 417 if (optlen < sizeof(*tv))
fe0c72f3 418 return -EINVAL;
4c1e34c0 419 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
fe0c72f3
AB
420 return -EFAULT;
421 }
4c1e34c0
RP
422
423 return 0;
424}
425EXPORT_SYMBOL(sock_copy_user_timeval);
426
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
	long val;

	if (err)
		return err;

	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		WRITE_ONCE(*timeo_p, 0);
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	val = MAX_SCHEDULE_TIMEOUT;
	if ((tv.tv_sec || tv.tv_usec) &&
	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
						    USEC_PER_SEC / HZ);
	WRITE_ONCE(*timeo_p, val);
	return 0;
}
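To make the jiffies conversion above concrete, here is a worked example written as a comment; HZ == 250 is only an assumed build configuration, not something this file dictates:

/*
 * Example, assuming HZ == 250 (one jiffy = 4 ms) and a 2.5 s timeout:
 *   tv.tv_sec = 2, tv.tv_usec = 500000
 *   val = 2 * 250 + DIV_ROUND_UP(500000, 1000000 / 250)
 *       = 500 + DIV_ROUND_UP(500000, 4000)
 *       = 500 + 125 = 625 jiffies, i.e. exactly 2.5 seconds.
 */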
459
080a270f
HFS
460static bool sock_needs_netstamp(const struct sock *sk)
461{
462 switch (sk->sk_family) {
463 case AF_UNSPEC:
464 case AF_UNIX:
465 return false;
466 default:
467 return true;
468 }
469}
470
08e29af3 471static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
4ec93edb 472{
08e29af3
ED
473 if (sk->sk_flags & flags) {
474 sk->sk_flags &= ~flags;
080a270f
HFS
475 if (sock_needs_netstamp(sk) &&
476 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
20d49473 477 net_disable_timestamp();
1da177e4
LT
478 }
479}
480
481
e6afc8ac 482int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
f0088a50 483{
3b885787
NH
484 unsigned long flags;
485 struct sk_buff_head *list = &sk->sk_receive_queue;
f0088a50 486
0fd7bac6 487 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
766e9037 488 atomic_inc(&sk->sk_drops);
3847ce32 489 trace_sock_rcvqueue_full(sk, skb);
766e9037 490 return -ENOMEM;
f0088a50
DV
491 }
492
c76562b6 493 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
766e9037
ED
494 atomic_inc(&sk->sk_drops);
495 return -ENOBUFS;
3ab224be
HA
496 }
497
f0088a50
DV
498 skb->dev = NULL;
499 skb_set_owner_r(skb, sk);
49ad9599 500
7fee226a
ED
501 /* we escape from rcu protected region, make sure we dont leak
502 * a norefcounted dst
503 */
504 skb_dst_force(skb);
505
3b885787 506 spin_lock_irqsave(&list->lock, flags);
3bc3b96f 507 sock_skb_set_dropcount(sk, skb);
3b885787
NH
508 __skb_queue_tail(list, skb);
509 spin_unlock_irqrestore(&list->lock, flags);
f0088a50
DV
510
511 if (!sock_flag(sk, SOCK_DEAD))
676d2369 512 sk->sk_data_ready(sk);
766e9037 513 return 0;
f0088a50 514}
e6afc8ac 515EXPORT_SYMBOL(__sock_queue_rcv_skb);
516
c1b8a567
MD
517int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
518 enum skb_drop_reason *reason)
e6afc8ac 519{
c1b8a567 520 enum skb_drop_reason drop_reason;
e6afc8ac 521 int err;
522
523 err = sk_filter(sk, skb);
c1b8a567
MD
524 if (err) {
525 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
526 goto out;
527 }
528 err = __sock_queue_rcv_skb(sk, skb);
529 switch (err) {
530 case -ENOMEM:
531 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
532 break;
533 case -ENOBUFS:
534 drop_reason = SKB_DROP_REASON_PROTO_MEM;
535 break;
536 default:
537 drop_reason = SKB_NOT_DROPPED_YET;
538 break;
539 }
540out:
541 if (reason)
542 *reason = drop_reason;
543 return err;
e6afc8ac 544}
c1b8a567 545EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
f0088a50 546
4f0c40d9 547int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
c3f24cfb 548 const int nested, unsigned int trim_cap, bool refcounted)
f0088a50
DV
549{
550 int rc = NET_RX_SUCCESS;
551
4f0c40d9 552 if (sk_filter_trim_cap(sk, skb, trim_cap))
f0088a50
DV
553 goto discard_and_relse;
554
555 skb->dev = NULL;
556
274f482d 557 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
c377411f
ED
558 atomic_inc(&sk->sk_drops);
559 goto discard_and_relse;
560 }
58a5a7b9
ACM
561 if (nested)
562 bh_lock_sock_nested(sk);
563 else
564 bh_lock_sock(sk);
a5b5bb9a
IM
565 if (!sock_owned_by_user(sk)) {
566 /*
567 * trylock + unlock semantics:
568 */
569 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
570
c57943a1 571 rc = sk_backlog_rcv(sk, skb);
a5b5bb9a 572
5facae4f 573 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
8265792b 574 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
8eae939f
ZY
575 bh_unlock_sock(sk);
576 atomic_inc(&sk->sk_drops);
577 goto discard_and_relse;
578 }
579
f0088a50
DV
580 bh_unlock_sock(sk);
581out:
c3f24cfb
ED
582 if (refcounted)
583 sock_put(sk);
f0088a50
DV
584 return rc;
585discard_and_relse:
586 kfree_skb(skb);
587 goto out;
588}
4f0c40d9 589EXPORT_SYMBOL(__sk_receive_skb);
f0088a50 590
bbd807df
BV
591INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
592 u32));
593INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
594 u32));
f0088a50
DV
595struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
596{
b6c6712a 597 struct dst_entry *dst = __sk_dst_get(sk);
f0088a50 598
bbd807df
BV
599 if (dst && dst->obsolete &&
600 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
601 dst, cookie) == NULL) {
e022f0b4 602 sk_tx_queue_clear(sk);
eb44ad4e 603 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
a9b3cd7f 604 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
f0088a50
DV
605 dst_release(dst);
606 return NULL;
607 }
608
609 return dst;
610}
611EXPORT_SYMBOL(__sk_dst_check);
612
613struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
614{
615 struct dst_entry *dst = sk_dst_get(sk);
616
bbd807df
BV
617 if (dst && dst->obsolete &&
618 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
619 dst, cookie) == NULL) {
f0088a50
DV
620 sk_dst_reset(sk);
621 dst_release(dst);
622 return NULL;
623 }
624
625 return dst;
626}
627EXPORT_SYMBOL(sk_dst_check);
628
7594888c 629static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
4878809f
DM
630{
631 int ret = -ENOPROTOOPT;
632#ifdef CONFIG_NETDEVICES
3b1e0a65 633 struct net *net = sock_net(sk);
4878809f
DM
634
635 /* Sorry... */
636 ret = -EPERM;
c427bfec 637 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
4878809f
DM
638 goto out;
639
f5dd3d0c
DR
640 ret = -EINVAL;
641 if (ifindex < 0)
642 goto out;
643
e5fccaa1
ED
644 /* Paired with all READ_ONCE() done locklessly. */
645 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
646
f5dd3d0c
DR
647 if (sk->sk_prot->rehash)
648 sk->sk_prot->rehash(sk);
649 sk_dst_reset(sk);
650
651 ret = 0;
652
653out:
654#endif
655
656 return ret;
657}
658
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int ret;

	if (lock_sk)
		lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, ifindex);
	if (lock_sk)
		release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);
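As a usage sketch (an editor's illustration, not code from this file), an in-kernel caller such as a tunnel driver can pin its transport socket to one interface by index:

static int example_pin_kernel_socket(struct socket *sock, int ifindex)
{
	/* lock_sk == true because this caller does not already hold
	 * the socket lock.
	 */
	return sock_bindtoindex(sock->sk, ifindex, true);
}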
672
5790642b 673static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
f5dd3d0c
DR
674{
675 int ret = -ENOPROTOOPT;
676#ifdef CONFIG_NETDEVICES
677 struct net *net = sock_net(sk);
678 char devname[IFNAMSIZ];
679 int index;
680
4878809f
DM
681 ret = -EINVAL;
682 if (optlen < 0)
683 goto out;
684
685 /* Bind this socket to a particular device like "eth0",
686 * as specified in the passed interface name. If the
687 * name is "" or the option length is zero the socket
688 * is not bound.
689 */
690 if (optlen > IFNAMSIZ - 1)
691 optlen = IFNAMSIZ - 1;
692 memset(devname, 0, sizeof(devname));
693
694 ret = -EFAULT;
5790642b 695 if (copy_from_sockptr(devname, optval, optlen))
4878809f
DM
696 goto out;
697
000ba2e4
DM
698 index = 0;
699 if (devname[0] != '\0') {
bf8e56bf 700 struct net_device *dev;
4878809f 701
bf8e56bf
ED
702 rcu_read_lock();
703 dev = dev_get_by_name_rcu(net, devname);
704 if (dev)
705 index = dev->ifindex;
706 rcu_read_unlock();
4878809f
DM
707 ret = -ENODEV;
708 if (!dev)
709 goto out;
4878809f
DM
710 }
711
24426654
MKL
712 sockopt_lock_sock(sk);
713 ret = sock_bindtoindex_locked(sk, index);
714 sockopt_release_sock(sk);
4878809f
DM
715out:
716#endif
717
718 return ret;
719}
720
4ff09db1
MKL
721static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
722 sockptr_t optlen, int len)
c91f6df2
BH
723{
724 int ret = -ENOPROTOOPT;
725#ifdef CONFIG_NETDEVICES
e5fccaa1 726 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
c91f6df2 727 struct net *net = sock_net(sk);
c91f6df2 728 char devname[IFNAMSIZ];
c91f6df2 729
e5fccaa1 730 if (bound_dev_if == 0) {
c91f6df2
BH
731 len = 0;
732 goto zero;
733 }
734
735 ret = -EINVAL;
736 if (len < IFNAMSIZ)
737 goto out;
738
e5fccaa1 739 ret = netdev_get_name(net, devname, bound_dev_if);
5dbe7c17 740 if (ret)
c91f6df2 741 goto out;
c91f6df2
BH
742
743 len = strlen(devname) + 1;
744
745 ret = -EFAULT;
4ff09db1 746 if (copy_to_sockptr(optval, devname, len))
c91f6df2
BH
747 goto out;
748
749zero:
750 ret = -EFAULT;
4ff09db1 751 if (copy_to_sockptr(optlen, &len, sizeof(int)))
c91f6df2
BH
752 goto out;
753
754 ret = 0;
755
756out:
757#endif
758
759 return ret;
760}
761
d986f521 762bool sk_mc_loop(const struct sock *sk)
f60e5990 763{
764 if (dev_recursion_level())
765 return false;
766 if (!sk)
767 return true;
a3e0fdf7
ED
768 /* IPV6_ADDRFORM can change sk->sk_family under us. */
769 switch (READ_ONCE(sk->sk_family)) {
f60e5990 770 case AF_INET:
b09bde5c 771 return inet_test_bit(MC_LOOP, sk);
f60e5990 772#if IS_ENABLED(CONFIG_IPV6)
773 case AF_INET6:
d986f521 774 return inet6_test_bit(MC6_LOOP, sk);
f60e5990 775#endif
776 }
0ad6f6e7 777 WARN_ON_ONCE(1);
f60e5990 778 return true;
779}
780EXPORT_SYMBOL(sk_mc_loop);
781
b58f0e8f
CH
782void sock_set_reuseaddr(struct sock *sk)
783{
784 lock_sock(sk);
785 sk->sk_reuse = SK_CAN_REUSE;
786 release_sock(sk);
787}
788EXPORT_SYMBOL(sock_set_reuseaddr);
789
fe31a326
CH
790void sock_set_reuseport(struct sock *sk)
791{
792 lock_sock(sk);
793 sk->sk_reuseport = true;
794 release_sock(sk);
795}
796EXPORT_SYMBOL(sock_set_reuseport);
797
c433594c
CH
798void sock_no_linger(struct sock *sk)
799{
800 lock_sock(sk);
bc1fb82a 801 WRITE_ONCE(sk->sk_lingertime, 0);
c433594c
CH
802 sock_set_flag(sk, SOCK_LINGER);
803 release_sock(sk);
804}
805EXPORT_SYMBOL(sock_no_linger);
806
6e434967
CH
807void sock_set_priority(struct sock *sk, u32 priority)
808{
8bf43be7 809 WRITE_ONCE(sk->sk_priority, priority);
6e434967
CH
810}
811EXPORT_SYMBOL(sock_set_priority);
812
76ee0785
CH
813void sock_set_sndtimeo(struct sock *sk, s64 secs)
814{
815 lock_sock(sk);
816 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
285975dd 817 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
76ee0785 818 else
285975dd 819 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
76ee0785
CH
820 release_sock(sk);
821}
822EXPORT_SYMBOL(sock_set_sndtimeo);
823
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (val) {
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
		sock_set_flag(sk, SOCK_RCVTSTAMP);
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	} else {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
	}
}

void sock_enable_timestamps(struct sock *sk)
{
	lock_sock(sk);
	__sock_set_timestamps(sk, true, false, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);
844
371087aa
FW
845void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
846{
847 switch (optname) {
848 case SO_TIMESTAMP_OLD:
849 __sock_set_timestamps(sk, valbool, false, false);
850 break;
851 case SO_TIMESTAMP_NEW:
852 __sock_set_timestamps(sk, valbool, true, false);
853 break;
854 case SO_TIMESTAMPNS_OLD:
855 __sock_set_timestamps(sk, valbool, false, true);
856 break;
857 case SO_TIMESTAMPNS_NEW:
858 __sock_set_timestamps(sk, valbool, true, true);
859 break;
860 }
861}
862
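The user-space side of the old-style receive-timestamp option handled above, as an editor's sketch (error handling omitted):

#include <sys/socket.h>

static void example_enable_rx_timestamps(int fd)
{
	int one = 1;

	/* Ends up in sock_set_timestamp() -> __sock_set_timestamps(); each
	 * recvmsg() will then carry an SCM_TIMESTAMPNS control message
	 * holding the packet's receive time as a struct timespec.
	 */
	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS, &one, sizeof(one));
}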
d463126e
YL
863static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
864{
865 struct net *net = sock_net(sk);
866 struct net_device *dev = NULL;
867 bool match = false;
868 int *vclock_index;
869 int i, num;
870
871 if (sk->sk_bound_dev_if)
872 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
873
874 if (!dev) {
		pr_err("%s: socket is not bound to a device\n", __func__);
876 return -EOPNOTSUPP;
877 }
878
879 num = ethtool_get_phc_vclocks(dev, &vclock_index);
2a4d75bf
ML
880 dev_put(dev);
881
d463126e
YL
882 for (i = 0; i < num; i++) {
883 if (*(vclock_index + i) == phc_index) {
884 match = true;
885 break;
886 }
887 }
888
889 if (num > 0)
890 kfree(vclock_index);
891
892 if (!match)
893 return -EINVAL;
894
251cd405 895 WRITE_ONCE(sk->sk_bind_phc, phc_index);
d463126e
YL
896
897 return 0;
898}
899
900int sock_set_timestamping(struct sock *sk, int optname,
901 struct so_timestamping timestamping)
ced122d9 902{
d463126e
YL
903 int val = timestamping.flags;
904 int ret;
905
ced122d9
FW
906 if (val & ~SOF_TIMESTAMPING_MASK)
907 return -EINVAL;
908
b534dc46
WB
909 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
910 !(val & SOF_TIMESTAMPING_OPT_ID))
911 return -EINVAL;
912
ced122d9
FW
913 if (val & SOF_TIMESTAMPING_OPT_ID &&
914 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
42f67eea 915 if (sk_is_tcp(sk)) {
ced122d9
FW
916 if ((1 << sk->sk_state) &
917 (TCPF_CLOSE | TCPF_LISTEN))
918 return -EINVAL;
b534dc46
WB
919 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
920 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
921 else
922 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
ced122d9 923 } else {
a1cdec57 924 atomic_set(&sk->sk_tskey, 0);
ced122d9
FW
925 }
926 }
927
928 if (val & SOF_TIMESTAMPING_OPT_STATS &&
929 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
930 return -EINVAL;
931
d463126e
YL
932 if (val & SOF_TIMESTAMPING_BIND_PHC) {
933 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
934 if (ret)
935 return ret;
936 }
937
e3390b30 938 WRITE_ONCE(sk->sk_tsflags, val);
ced122d9
FW
939 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
940
941 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
942 sock_enable_timestamp(sk,
943 SOCK_TIMESTAMPING_RX_SOFTWARE);
944 else
945 sock_disable_timestamp(sk,
946 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
947 return 0;
948}
949
ce3d9544
CH
950void sock_set_keepalive(struct sock *sk)
951{
952 lock_sock(sk);
953 if (sk->sk_prot->keepalive)
954 sk->sk_prot->keepalive(sk, true);
955 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
956 release_sock(sk);
957}
958EXPORT_SYMBOL(sock_set_keepalive);
959
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead. Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
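Seen from user space, the doubling above shows up when the value is read back; a small editor's sketch with illustrative numbers:

#include <sys/socket.h>

static int example_rcvbuf_doubling(int fd)
{
	int req = 64 * 1024, got = 0;
	socklen_t len = sizeof(got);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
	return got;	/* typically 2 * req, i.e. 131072 */
}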
988
dd9082f4
AA
989static void __sock_set_mark(struct sock *sk, u32 val)
990{
991 if (val != sk->sk_mark) {
3c5b4d69 992 WRITE_ONCE(sk->sk_mark, val);
dd9082f4
AA
993 sk_dst_reset(sk);
994 }
995}
996
84d1c617
AA
997void sock_set_mark(struct sock *sk, u32 val)
998{
999 lock_sock(sk);
dd9082f4 1000 __sock_set_mark(sk, val);
84d1c617
AA
1001 release_sock(sk);
1002}
1003EXPORT_SYMBOL(sock_set_mark);
1004
2bb2f5fb
WW
1005static void sock_release_reserved_memory(struct sock *sk, int bytes)
1006{
1007 /* Round down bytes to multiple of pages */
100fdd1f 1008 bytes = round_down(bytes, PAGE_SIZE);
2bb2f5fb
WW
1009
1010 WARN_ON(bytes > sk->sk_reserved_mem);
fe11fdcb 1011 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
2bb2f5fb
WW
1012 sk_mem_reclaim(sk);
1013}
1014
1015static int sock_reserve_memory(struct sock *sk, int bytes)
1016{
1017 long allocated;
1018 bool charged;
1019 int pages;
1020
d00c8ee3 1021 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
2bb2f5fb
WW
1022 return -EOPNOTSUPP;
1023
1024 if (!bytes)
1025 return 0;
1026
1027 pages = sk_mem_pages(bytes);
1028
1029 /* pre-charge to memcg */
1030 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1031 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1032 if (!charged)
1033 return -ENOMEM;
1034
1035 /* pre-charge to forward_alloc */
219160be
ED
1036 sk_memory_allocated_add(sk, pages);
1037 allocated = sk_memory_allocated(sk);
2bb2f5fb
WW
1038 /* If the system goes into memory pressure with this
1039 * precharge, give up and return error.
1040 */
1041 if (allocated > sk_prot_mem_limits(sk, 1)) {
1042 sk_memory_allocated_sub(sk, pages);
1043 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1044 return -ENOMEM;
1045 }
5e6300e7 1046 sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
2bb2f5fb 1047
fe11fdcb
ED
1048 WRITE_ONCE(sk->sk_reserved_mem,
1049 sk->sk_reserved_mem + (pages << PAGE_SHIFT));
2bb2f5fb
WW
1050
1051 return 0;
1052}
1053
24426654
MKL
1054void sockopt_lock_sock(struct sock *sk)
1055{
1056 /* When current->bpf_ctx is set, the setsockopt is called from
1057 * a bpf prog. bpf has ensured the sk lock has been
1058 * acquired before calling setsockopt().
1059 */
1060 if (has_current_bpf_ctx())
1061 return;
1062
1063 lock_sock(sk);
1064}
1065EXPORT_SYMBOL(sockopt_lock_sock);
1066
1067void sockopt_release_sock(struct sock *sk)
1068{
1069 if (has_current_bpf_ctx())
1070 return;
1071
1072 release_sock(sk);
1073}
1074EXPORT_SYMBOL(sockopt_release_sock);
1075
e42c7bee
MKL
1076bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1077{
1078 return has_current_bpf_ctx() || ns_capable(ns, cap);
1079}
1080EXPORT_SYMBOL(sockopt_ns_capable);
1081
1082bool sockopt_capable(int cap)
1083{
1084 return has_current_bpf_ctx() || capable(cap);
1085}
1086EXPORT_SYMBOL(sockopt_capable);
1087
1da177e4
LT
1088/*
1089 * This is meant for all protocols to use and covers goings on
1090 * at the socket level. Everything here is generic.
1091 */
1092
29003875
MKL
1093int sk_setsockopt(struct sock *sk, int level, int optname,
1094 sockptr_t optval, unsigned int optlen)
1da177e4 1095{
d463126e 1096 struct so_timestamping timestamping;
4d748f99 1097 struct socket *sock = sk->sk_socket;
80b14dee 1098 struct sock_txtime sk_txtime;
1da177e4
LT
1099 int val;
1100 int valbool;
1101 struct linger ling;
1102 int ret = 0;
4ec93edb 1103
1da177e4
LT
1104 /*
1105 * Options without arguments
1106 */
1107
4878809f 1108 if (optname == SO_BINDTODEVICE)
c91f6df2 1109 return sock_setbindtodevice(sk, optval, optlen);
4878809f 1110
e71a4783
SH
1111 if (optlen < sizeof(int))
1112 return -EINVAL;
4ec93edb 1113
c8c1bbb6 1114 if (copy_from_sockptr(&val, optval, sizeof(val)))
1da177e4 1115 return -EFAULT;
4ec93edb 1116
2a91525c 1117 valbool = val ? 1 : 0;
1da177e4 1118
10bbf165
ED
1119 /* handle options which do not require locking the socket. */
1120 switch (optname) {
1121 case SO_PRIORITY:
1122 if ((val >= 0 && val <= 6) ||
1123 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1124 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1125 sock_set_priority(sk, val);
1126 return 0;
1127 }
1128 return -EPERM;
8ebfb6db
ED
1129 case SO_PASSSEC:
1130 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1131 return 0;
1132 case SO_PASSCRED:
1133 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1134 return 0;
1135 case SO_PASSPIDFD:
1136 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1137 return 0;
b1202515
ED
1138 case SO_TYPE:
1139 case SO_PROTOCOL:
1140 case SO_DOMAIN:
1141 case SO_ERROR:
1142 return -ENOPROTOOPT;
2a4319cf
ED
1143#ifdef CONFIG_NET_RX_BUSY_POLL
1144 case SO_BUSY_POLL:
1145 if (val < 0)
1146 return -EINVAL;
1147 WRITE_ONCE(sk->sk_ll_usec, val);
1148 return 0;
1149 case SO_PREFER_BUSY_POLL:
1150 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1151 return -EPERM;
1152 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1153 return 0;
1154 case SO_BUSY_POLL_BUDGET:
1155 if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1156 !sockopt_capable(CAP_NET_ADMIN))
1157 return -EPERM;
1158 if (val < 0 || val > U16_MAX)
1159 return -EINVAL;
1160 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1161 return 0;
1162#endif
28b24f90
ED
1163 case SO_MAX_PACING_RATE:
1164 {
1165 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1166 unsigned long pacing_rate;
1167
1168 if (sizeof(ulval) != sizeof(val) &&
1169 optlen >= sizeof(ulval) &&
1170 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1171 return -EFAULT;
1172 }
1173 if (ulval != ~0UL)
1174 cmpxchg(&sk->sk_pacing_status,
1175 SK_PACING_NONE,
1176 SK_PACING_NEEDED);
1177 /* Pairs with READ_ONCE() from sk_getsockopt() */
1178 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1179 pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1180 if (ulval < pacing_rate)
1181 WRITE_ONCE(sk->sk_pacing_rate, ulval);
1182 return 0;
1183 }
5eef0b8d
ED
1184 case SO_TXREHASH:
1185 if (val < -1 || val > 1)
1186 return -EINVAL;
1187 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1188 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1189 /* Paired with READ_ONCE() in tcp_rtx_synack()
1190 * and sk_getsockopt().
1191 */
1192 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1193 return 0;
10bbf165
ED
1194 }
1195
24426654 1196 sockopt_lock_sock(sk);
1da177e4 1197
2a91525c 1198 switch (optname) {
e71a4783 1199 case SO_DEBUG:
e42c7bee 1200 if (val && !sockopt_capable(CAP_NET_ADMIN))
e71a4783 1201 ret = -EACCES;
2a91525c 1202 else
c0ef877b 1203 sock_valbool_flag(sk, SOCK_DBG, valbool);
e71a4783
SH
1204 break;
1205 case SO_REUSEADDR:
cdb8744d 1206 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
e71a4783 1207 break;
055dc21a
TH
1208 case SO_REUSEPORT:
1209 sk->sk_reuseport = valbool;
1210 break;
e71a4783 1211 case SO_DONTROUTE:
c0ef877b 1212 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
0fbe82e6 1213 sk_dst_reset(sk);
e71a4783
SH
1214 break;
1215 case SO_BROADCAST:
1216 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1217 break;
1218 case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
1227c177 1224 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
b0573dea 1225set_sndbuf:
4057765f
GN
1226 /* Ensure val * 2 fits into an int, to prevent max_t()
1227 * from treating it as a negative value.
1228 */
1229 val = min_t(int, val, INT_MAX / 2);
e71a4783 1230 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
e292f05e
ED
1231 WRITE_ONCE(sk->sk_sndbuf,
1232 max_t(int, val * 2, SOCK_MIN_SNDBUF));
82981930 1233 /* Wake up sending tasks if we upped the value. */
e71a4783
SH
1234 sk->sk_write_space(sk);
1235 break;
1da177e4 1236
e71a4783 1237 case SO_SNDBUFFORCE:
e42c7bee 1238 if (!sockopt_capable(CAP_NET_ADMIN)) {
e71a4783
SH
1239 ret = -EPERM;
1240 break;
1241 }
4057765f
GN
1242
1243 /* No negative values (to prevent underflow, as val will be
1244 * multiplied by 2).
1245 */
1246 if (val < 0)
1247 val = 0;
e71a4783 1248 goto set_sndbuf;
b0573dea 1249
e71a4783
SH
1250 case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
1227c177 1256 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
e71a4783
SH
1257 break;
1258
1259 case SO_RCVBUFFORCE:
e42c7bee 1260 if (!sockopt_capable(CAP_NET_ADMIN)) {
e71a4783 1261 ret = -EPERM;
1da177e4 1262 break;
e71a4783 1263 }
4057765f
GN
1264
1265 /* No negative values (to prevent underflow, as val will be
1266 * multiplied by 2).
1267 */
26cfabf9
CH
1268 __sock_set_rcvbuf(sk, max(val, 0));
1269 break;
1da177e4 1270
e71a4783 1271 case SO_KEEPALIVE:
4b9d07a4
UB
1272 if (sk->sk_prot->keepalive)
1273 sk->sk_prot->keepalive(sk, valbool);
e71a4783
SH
1274 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1275 break;
1276
1277 case SO_OOBINLINE:
1278 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1279 break;
1280
1281 case SO_NO_CHECK:
28448b80 1282 sk->sk_no_check_tx = valbool;
e71a4783
SH
1283 break;
1284
e71a4783
SH
1285 case SO_LINGER:
1286 if (optlen < sizeof(ling)) {
1287 ret = -EINVAL; /* 1003.1g */
1da177e4 1288 break;
e71a4783 1289 }
c8c1bbb6 1290 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
e71a4783 1291 ret = -EFAULT;
1da177e4 1292 break;
e71a4783 1293 }
bc1fb82a 1294 if (!ling.l_onoff) {
e71a4783 1295 sock_reset_flag(sk, SOCK_LINGER);
bc1fb82a
ED
1296 } else {
1297 unsigned long t_sec = ling.l_linger;
1298
1299 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1300 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1da177e4 1301 else
bc1fb82a 1302 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
e71a4783
SH
1303 sock_set_flag(sk, SOCK_LINGER);
1304 }
1305 break;
1306
1307 case SO_BSDCOMPAT:
e71a4783
SH
1308 break;
1309
7f1bc6e9 1310 case SO_TIMESTAMP_OLD:
887feae3 1311 case SO_TIMESTAMP_NEW:
7f1bc6e9 1312 case SO_TIMESTAMPNS_OLD:
887feae3 1313 case SO_TIMESTAMPNS_NEW:
81b4a0cc 1314 sock_set_timestamp(sk, optname, valbool);
e71a4783 1315 break;
ced122d9 1316
9718475e 1317 case SO_TIMESTAMPING_NEW:
7f1bc6e9 1318 case SO_TIMESTAMPING_OLD:
d463126e
YL
1319 if (optlen == sizeof(timestamping)) {
1320 if (copy_from_sockptr(&timestamping, optval,
271dbc31
DC
1321 sizeof(timestamping))) {
1322 ret = -EFAULT;
1323 break;
1324 }
d463126e
YL
1325 } else {
1326 memset(&timestamping, 0, sizeof(timestamping));
1327 timestamping.flags = val;
1328 }
1329 ret = sock_set_timestamping(sk, optname, timestamping);
20d49473
PO
1330 break;
1331
e71a4783 1332 case SO_RCVLOWAT:
1ded5e5a
ED
1333 {
1334 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1335
e71a4783
SH
1336 if (val < 0)
1337 val = INT_MAX;
1ded5e5a
ED
1338 if (sock)
1339 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1340 if (set_rcvlowat)
1341 ret = set_rcvlowat(sk, val);
d1361840 1342 else
eac66402 1343 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
e71a4783 1344 break;
1ded5e5a 1345 }
45bdc661 1346 case SO_RCVTIMEO_OLD:
a9beb86a 1347 case SO_RCVTIMEO_NEW:
c8c1bbb6 1348 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
c34645ac 1349 optlen, optname == SO_RCVTIMEO_OLD);
e71a4783
SH
1350 break;
1351
45bdc661 1352 case SO_SNDTIMEO_OLD:
a9beb86a 1353 case SO_SNDTIMEO_NEW:
c8c1bbb6 1354 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
c34645ac 1355 optlen, optname == SO_SNDTIMEO_OLD);
e71a4783 1356 break;
1da177e4 1357
4d295e54
CH
1358 case SO_ATTACH_FILTER: {
1359 struct sock_fprog fprog;
e71a4783 1360
c8c1bbb6 1361 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
4d295e54 1362 if (!ret)
e71a4783 1363 ret = sk_attach_filter(&fprog, sk);
e71a4783 1364 break;
4d295e54 1365 }
89aa0758
AS
1366 case SO_ATTACH_BPF:
1367 ret = -EINVAL;
1368 if (optlen == sizeof(u32)) {
1369 u32 ufd;
1370
1371 ret = -EFAULT;
c8c1bbb6 1372 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
89aa0758
AS
1373 break;
1374
1375 ret = sk_attach_bpf(ufd, sk);
1376 }
1377 break;
1378
4d295e54
CH
1379 case SO_ATTACH_REUSEPORT_CBPF: {
1380 struct sock_fprog fprog;
538950a1 1381
c8c1bbb6 1382 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
4d295e54 1383 if (!ret)
538950a1 1384 ret = sk_reuseport_attach_filter(&fprog, sk);
538950a1 1385 break;
4d295e54 1386 }
538950a1
CG
1387 case SO_ATTACH_REUSEPORT_EBPF:
1388 ret = -EINVAL;
1389 if (optlen == sizeof(u32)) {
1390 u32 ufd;
1391
1392 ret = -EFAULT;
c8c1bbb6 1393 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
538950a1
CG
1394 break;
1395
1396 ret = sk_reuseport_attach_bpf(ufd, sk);
1397 }
1398 break;
1399
99f3a064
MKL
1400 case SO_DETACH_REUSEPORT_BPF:
1401 ret = reuseport_detach_prog(sk);
1402 break;
1403
e71a4783 1404 case SO_DETACH_FILTER:
55b33325 1405 ret = sk_detach_filter(sk);
e71a4783 1406 break;
1da177e4 1407
d59577b6
VB
1408 case SO_LOCK_FILTER:
1409 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1410 ret = -EPERM;
1411 else
1412 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1413 break;
1414
4a19ec58 1415 case SO_MARK:
e42c7bee
MKL
1416 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1417 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
4a19ec58 1418 ret = -EPERM;
dd9082f4 1419 break;
50254256 1420 }
dd9082f4
AA
1421
1422 __sock_set_mark(sk, val);
4a19ec58 1423 break;
6fd1d51c
EM
1424 case SO_RCVMARK:
1425 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1426 break;
877ce7c1 1427
3b885787 1428 case SO_RXQ_OVFL:
8083f0fc 1429 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
3b885787 1430 break;
6e3e939f
JB
1431
1432 case SO_WIFI_STATUS:
1433 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1434 break;
1435
ef64a54f 1436 case SO_PEEK_OFF:
1ded5e5a
ED
1437 {
1438 int (*set_peek_off)(struct sock *sk, int val);
1439
1440 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1441 if (set_peek_off)
1442 ret = set_peek_off(sk, val);
ef64a54f
PE
1443 else
1444 ret = -EOPNOTSUPP;
1445 break;
1ded5e5a 1446 }
3bdc0eba
BG
1447
1448 case SO_NOFCS:
1449 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1450 break;
1451
7d4c04fc
KJ
1452 case SO_SELECT_ERR_QUEUE:
1453 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1454 break;
1455
62748f32 1456
70da268b 1457 case SO_INCOMING_CPU:
b261eda8 1458 reuseport_update_incoming_cpu(sk, val);
70da268b
ED
1459 break;
1460
a87cb3e4
TH
1461 case SO_CNX_ADVICE:
1462 if (val == 1)
1463 dst_negative_advice(sk);
1464 break;
76851d12
WB
1465
1466 case SO_ZEROCOPY:
28190752 1467 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
42f67eea 1468 if (!(sk_is_tcp(sk) ||
b5947e5d
WB
1469 (sk->sk_type == SOCK_DGRAM &&
1470 sk->sk_protocol == IPPROTO_UDP)))
869420a8 1471 ret = -EOPNOTSUPP;
28190752 1472 } else if (sk->sk_family != PF_RDS) {
869420a8 1473 ret = -EOPNOTSUPP;
28190752
SV
1474 }
1475 if (!ret) {
1476 if (val < 0 || val > 1)
1477 ret = -EINVAL;
1478 else
1479 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
28190752 1480 }
334e6413
JSP
1481 break;
1482
80b14dee 1483 case SO_TXTIME:
790709f2 1484 if (optlen != sizeof(struct sock_txtime)) {
80b14dee 1485 ret = -EINVAL;
790709f2 1486 break;
c8c1bbb6 1487 } else if (copy_from_sockptr(&sk_txtime, optval,
80b14dee
RC
1488 sizeof(struct sock_txtime))) {
1489 ret = -EFAULT;
790709f2 1490 break;
80b14dee
RC
1491 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1492 ret = -EINVAL;
790709f2
ED
1493 break;
1494 }
1495 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1496 * scheduler has enough safe guards.
1497 */
1498 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
e42c7bee 1499 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
790709f2
ED
1500 ret = -EPERM;
1501 break;
80b14dee 1502 }
790709f2
ED
1503 sock_valbool_flag(sk, SOCK_TXTIME, true);
1504 sk->sk_clockid = sk_txtime.clockid;
1505 sk->sk_txtime_deadline_mode =
1506 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1507 sk->sk_txtime_report_errors =
1508 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
80b14dee
RC
1509 break;
1510
f5dd3d0c 1511 case SO_BINDTOIFINDEX:
7594888c 1512 ret = sock_bindtoindex_locked(sk, val);
f5dd3d0c
DR
1513 break;
1514
04190bf8
PT
1515 case SO_BUF_LOCK:
1516 if (val & ~SOCK_BUF_LOCK_MASK) {
1517 ret = -EINVAL;
1518 break;
1519 }
1520 sk->sk_userlocks = val | (sk->sk_userlocks &
1521 ~SOCK_BUF_LOCK_MASK);
1522 break;
1523
2bb2f5fb
WW
1524 case SO_RESERVE_MEM:
1525 {
1526 int delta;
1527
1528 if (val < 0) {
1529 ret = -EINVAL;
1530 break;
1531 }
1532
1533 delta = val - sk->sk_reserved_mem;
1534 if (delta < 0)
1535 sock_release_reserved_memory(sk, -delta);
1536 else
1537 ret = sock_reserve_memory(sk, delta);
1538 break;
1539 }
1540
e71a4783
SH
1541 default:
1542 ret = -ENOPROTOOPT;
1543 break;
4ec93edb 1544 }
24426654 1545 sockopt_release_sock(sk);
1da177e4
LT
1546 return ret;
1547}
4d748f99
MKL
1548
1549int sock_setsockopt(struct socket *sock, int level, int optname,
1550 sockptr_t optval, unsigned int optlen)
1551{
1552 return sk_setsockopt(sock->sk, level, optname,
1553 optval, optlen);
1554}
2a91525c 1555EXPORT_SYMBOL(sock_setsockopt);
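For reference, the user-space view of one of the options handled above (SO_LINGER); an editor's sketch with error handling omitted:

#include <sys/socket.h>

static void example_set_linger(int fd)
{
	/* close() may now block for up to 5 seconds while unsent data
	 * drains; handled in the SO_LINGER case of sk_setsockopt() above.
	 */
	struct linger lg = { .l_onoff = 1, .l_linger = 5 };

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
}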
1da177e4 1556
35306eb2
ED
1557static const struct cred *sk_get_peer_cred(struct sock *sk)
1558{
1559 const struct cred *cred;
1560
1561 spin_lock(&sk->sk_peer_lock);
1562 cred = get_cred(sk->sk_peer_cred);
1563 spin_unlock(&sk->sk_peer_lock);
1564
1565 return cred;
1566}
1da177e4 1567
8f09898b 1568static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1569 struct ucred *ucred)
3f551f94
EB
1570{
1571 ucred->pid = pid_vnr(pid);
1572 ucred->uid = ucred->gid = -1;
1573 if (cred) {
1574 struct user_namespace *current_ns = current_user_ns();
1575
b2e4f544
EB
1576 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1577 ucred->gid = from_kgid_munged(current_ns, cred->egid);
3f551f94
EB
1578 }
1579}
1580
4ff09db1 1581static int groups_to_user(sockptr_t dst, const struct group_info *src)
28b5ba2a
DR
1582{
1583 struct user_namespace *user_ns = current_user_ns();
1584 int i;
1585
4ff09db1
MKL
1586 for (i = 0; i < src->ngroups; i++) {
1587 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1588
1589 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
28b5ba2a 1590 return -EFAULT;
4ff09db1 1591 }
28b5ba2a
DR
1592
1593 return 0;
1594}
1595
65ddc82d
MKL
1596int sk_getsockopt(struct sock *sk, int level, int optname,
1597 sockptr_t optval, sockptr_t optlen)
1da177e4 1598{
ba74a760 1599 struct socket *sock = sk->sk_socket;
4ec93edb 1600
e71a4783 1601 union {
4ec93edb 1602 int val;
5daab9db 1603 u64 val64;
677f136c 1604 unsigned long ulval;
4ec93edb 1605 struct linger ling;
fe0c72f3
AB
1606 struct old_timeval32 tm32;
1607 struct __kernel_old_timeval tm;
a9beb86a 1608 struct __kernel_sock_timeval stm;
80b14dee 1609 struct sock_txtime txtime;
d463126e 1610 struct so_timestamping timestamping;
1da177e4 1611 } v;
4ec93edb 1612
4d0392be 1613 int lv = sizeof(int);
1da177e4 1614 int len;
4ec93edb 1615
4ff09db1 1616 if (copy_from_sockptr(&len, optlen, sizeof(int)))
4ec93edb 1617 return -EFAULT;
e71a4783 1618 if (len < 0)
1da177e4 1619 return -EINVAL;
4ec93edb 1620
50fee1de 1621 memset(&v, 0, sizeof(v));
df0bca04 1622
2a91525c 1623 switch (optname) {
e71a4783
SH
1624 case SO_DEBUG:
1625 v.val = sock_flag(sk, SOCK_DBG);
1626 break;
1627
1628 case SO_DONTROUTE:
1629 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1630 break;
1631
1632 case SO_BROADCAST:
1b23a5df 1633 v.val = sock_flag(sk, SOCK_BROADCAST);
e71a4783
SH
1634 break;
1635
1636 case SO_SNDBUF:
74bc0843 1637 v.val = READ_ONCE(sk->sk_sndbuf);
e71a4783
SH
1638 break;
1639
1640 case SO_RCVBUF:
b4b55325 1641 v.val = READ_ONCE(sk->sk_rcvbuf);
e71a4783
SH
1642 break;
1643
1644 case SO_REUSEADDR:
1645 v.val = sk->sk_reuse;
1646 break;
1647
055dc21a
TH
1648 case SO_REUSEPORT:
1649 v.val = sk->sk_reuseport;
1650 break;
1651
e71a4783 1652 case SO_KEEPALIVE:
1b23a5df 1653 v.val = sock_flag(sk, SOCK_KEEPOPEN);
e71a4783
SH
1654 break;
1655
1656 case SO_TYPE:
1657 v.val = sk->sk_type;
1658 break;
1659
49c794e9
JE
1660 case SO_PROTOCOL:
1661 v.val = sk->sk_protocol;
1662 break;
1663
0d6038ee
JE
1664 case SO_DOMAIN:
1665 v.val = sk->sk_family;
1666 break;
1667
e71a4783
SH
1668 case SO_ERROR:
1669 v.val = -sock_error(sk);
2a91525c 1670 if (v.val == 0)
e71a4783
SH
1671 v.val = xchg(&sk->sk_err_soft, 0);
1672 break;
1673
1674 case SO_OOBINLINE:
1b23a5df 1675 v.val = sock_flag(sk, SOCK_URGINLINE);
e71a4783
SH
1676 break;
1677
1678 case SO_NO_CHECK:
28448b80 1679 v.val = sk->sk_no_check_tx;
e71a4783
SH
1680 break;
1681
1682 case SO_PRIORITY:
8bf43be7 1683 v.val = READ_ONCE(sk->sk_priority);
e71a4783
SH
1684 break;
1685
1686 case SO_LINGER:
1687 lv = sizeof(v.ling);
1b23a5df 1688 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
bc1fb82a 1689 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
e71a4783
SH
1690 break;
1691
1692 case SO_BSDCOMPAT:
e71a4783
SH
1693 break;
1694
7f1bc6e9 1695 case SO_TIMESTAMP_OLD:
92f37fd2 1696 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
887feae3 1697 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
92f37fd2
ED
1698 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1699 break;
1700
7f1bc6e9 1701 case SO_TIMESTAMPNS_OLD:
887feae3
DD
1702 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1703 break;
1704
1705 case SO_TIMESTAMP_NEW:
1706 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1707 break;
1708
1709 case SO_TIMESTAMPNS_NEW:
1710 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
e71a4783
SH
1711 break;
1712
7f1bc6e9 1713 case SO_TIMESTAMPING_OLD:
d463126e 1714 lv = sizeof(v.timestamping);
e3390b30 1715 v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
251cd405 1716 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
20d49473
PO
1717 break;
1718
a9beb86a
DD
1719 case SO_RCVTIMEO_OLD:
1720 case SO_RCVTIMEO_NEW:
285975dd
ED
1721 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1722 SO_RCVTIMEO_OLD == optname);
e71a4783
SH
1723 break;
1724
a9beb86a
DD
1725 case SO_SNDTIMEO_OLD:
1726 case SO_SNDTIMEO_NEW:
285975dd
ED
1727 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1728 SO_SNDTIMEO_OLD == optname);
e71a4783 1729 break;
1da177e4 1730
e71a4783 1731 case SO_RCVLOWAT:
e6d12bdb 1732 v.val = READ_ONCE(sk->sk_rcvlowat);
e71a4783 1733 break;
1da177e4 1734
e71a4783 1735 case SO_SNDLOWAT:
2a91525c 1736 v.val = 1;
e71a4783 1737 break;
1da177e4 1738
e71a4783 1739 case SO_PASSCRED:
82981930 1740 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
e71a4783 1741 break;
1da177e4 1742
5e2ff670
AM
1743 case SO_PASSPIDFD:
1744 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1745 break;
1746
e71a4783 1747 case SO_PEERCRED:
109f6e39
EB
1748 {
1749 struct ucred peercred;
1750 if (len > sizeof(peercred))
1751 len = sizeof(peercred);
35306eb2
ED
1752
1753 spin_lock(&sk->sk_peer_lock);
109f6e39 1754 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
35306eb2
ED
1755 spin_unlock(&sk->sk_peer_lock);
1756
4ff09db1 1757 if (copy_to_sockptr(optval, &peercred, len))
e71a4783
SH
1758 return -EFAULT;
1759 goto lenout;
109f6e39 1760 }
1da177e4 1761
7b26952a
AM
1762 case SO_PEERPIDFD:
1763 {
1764 struct pid *peer_pid;
1765 struct file *pidfd_file = NULL;
1766 int pidfd;
1767
1768 if (len > sizeof(pidfd))
1769 len = sizeof(pidfd);
1770
1771 spin_lock(&sk->sk_peer_lock);
1772 peer_pid = get_pid(sk->sk_peer_pid);
1773 spin_unlock(&sk->sk_peer_lock);
1774
1775 if (!peer_pid)
b6f79e82 1776 return -ENODATA;
7b26952a
AM
1777
1778 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1779 put_pid(peer_pid);
1780 if (pidfd < 0)
1781 return pidfd;
1782
1783 if (copy_to_sockptr(optval, &pidfd, len) ||
1784 copy_to_sockptr(optlen, &len, sizeof(int))) {
1785 put_unused_fd(pidfd);
1786 fput(pidfd_file);
1787
1788 return -EFAULT;
1789 }
1790
1791 fd_install(pidfd, pidfd_file);
1792 return 0;
1793 }
1794
28b5ba2a
DR
1795 case SO_PEERGROUPS:
1796 {
35306eb2 1797 const struct cred *cred;
28b5ba2a
DR
1798 int ret, n;
1799
35306eb2
ED
1800 cred = sk_get_peer_cred(sk);
1801 if (!cred)
28b5ba2a
DR
1802 return -ENODATA;
1803
35306eb2 1804 n = cred->group_info->ngroups;
28b5ba2a
DR
1805 if (len < n * sizeof(gid_t)) {
1806 len = n * sizeof(gid_t);
35306eb2 1807 put_cred(cred);
4ff09db1 1808 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
28b5ba2a
DR
1809 }
1810 len = n * sizeof(gid_t);
1811
4ff09db1 1812 ret = groups_to_user(optval, cred->group_info);
35306eb2 1813 put_cred(cred);
28b5ba2a
DR
1814 if (ret)
1815 return ret;
1816 goto lenout;
1817 }
1818
e71a4783
SH
1819 case SO_PEERNAME:
1820 {
8936bf53 1821 struct sockaddr_storage address;
e71a4783 1822
1ded5e5a 1823 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
9b2c45d4 1824 if (lv < 0)
e71a4783
SH
1825 return -ENOTCONN;
1826 if (lv < len)
1827 return -EINVAL;
8936bf53 1828 if (copy_to_sockptr(optval, &address, len))
e71a4783
SH
1829 return -EFAULT;
1830 goto lenout;
1831 }
1da177e4 1832
e71a4783
SH
1833 /* Dubious BSD thing... Probably nobody even uses it, but
1834 * the UNIX standard wants it for whatever reason... -DaveM
1835 */
1836 case SO_ACCEPTCONN:
1837 v.val = sk->sk_state == TCP_LISTEN;
1838 break;
1da177e4 1839
e71a4783 1840 case SO_PASSSEC:
82981930 1841 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
e71a4783 1842 break;
877ce7c1 1843
e71a4783 1844 case SO_PEERSEC:
b10b9c34
PM
1845 return security_socket_getpeersec_stream(sock,
1846 optval, optlen, len);
1da177e4 1847
4a19ec58 1848 case SO_MARK:
3c5b4d69 1849 v.val = READ_ONCE(sk->sk_mark);
4a19ec58
LAT
1850 break;
1851
6fd1d51c
EM
1852 case SO_RCVMARK:
1853 v.val = sock_flag(sk, SOCK_RCVMARK);
1854 break;
1855
3b885787 1856 case SO_RXQ_OVFL:
1b23a5df 1857 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
3b885787
NH
1858 break;
1859
6e3e939f 1860 case SO_WIFI_STATUS:
1b23a5df 1861 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
6e3e939f
JB
1862 break;
1863
ef64a54f 1864 case SO_PEEK_OFF:
1ded5e5a 1865 if (!READ_ONCE(sock->ops)->set_peek_off)
ef64a54f
PE
1866 return -EOPNOTSUPP;
1867
11695c6e 1868 v.val = READ_ONCE(sk->sk_peek_off);
ef64a54f 1869 break;
bc2f7996 1870 case SO_NOFCS:
1b23a5df 1871 v.val = sock_flag(sk, SOCK_NOFCS);
bc2f7996 1872 break;
c91f6df2 1873
f7b86bfe 1874 case SO_BINDTODEVICE:
c91f6df2
BH
1875 return sock_getbindtodevice(sk, optval, optlen, len);
1876
a8fc9277 1877 case SO_GET_FILTER:
4ff09db1 1878 len = sk_get_filter(sk, optval, len);
a8fc9277
PE
1879 if (len < 0)
1880 return len;
1881
1882 goto lenout;
c91f6df2 1883
d59577b6
VB
1884 case SO_LOCK_FILTER:
1885 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1886 break;
1887
ea02f941
MS
1888 case SO_BPF_EXTENSIONS:
1889 v.val = bpf_tell_extensions();
1890 break;
1891
7d4c04fc
KJ
1892 case SO_SELECT_ERR_QUEUE:
1893 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1894 break;
1895
e0d1095a 1896#ifdef CONFIG_NET_RX_BUSY_POLL
64b0dc51 1897 case SO_BUSY_POLL:
e5f0d2dd 1898 v.val = READ_ONCE(sk->sk_ll_usec);
dafcc438 1899 break;
7fd3253a
BT
1900 case SO_PREFER_BUSY_POLL:
1901 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1902 break;
dafcc438
ET
1903#endif
1904
62748f32 1905 case SO_MAX_PACING_RATE:
ea7f45ef 1906 /* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
677f136c
ED
1907 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1908 lv = sizeof(v.ulval);
ea7f45ef 1909 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
677f136c
ED
1910 } else {
1911 /* 32bit version */
ea7f45ef
ED
1912 v.val = min_t(unsigned long, ~0U,
1913 READ_ONCE(sk->sk_max_pacing_rate));
677f136c 1914 }
62748f32
ED
1915 break;
1916
2c8c56e1 1917 case SO_INCOMING_CPU:
7170a977 1918 v.val = READ_ONCE(sk->sk_incoming_cpu);
2c8c56e1
ED
1919 break;
1920
a2d133b1
JH
1921 case SO_MEMINFO:
1922 {
1923 u32 meminfo[SK_MEMINFO_VARS];
1924
a2d133b1
JH
1925 sk_get_meminfo(sk, meminfo);
1926
1927 len = min_t(unsigned int, len, sizeof(meminfo));
4ff09db1 1928 if (copy_to_sockptr(optval, &meminfo, len))
a2d133b1
JH
1929 return -EFAULT;
1930
1931 goto lenout;
1932 }
6d433902
SS
1933
1934#ifdef CONFIG_NET_RX_BUSY_POLL
1935 case SO_INCOMING_NAPI_ID:
1936 v.val = READ_ONCE(sk->sk_napi_id);
1937
1938 /* aggregate non-NAPI IDs down to 0 */
1939 if (v.val < MIN_NAPI_ID)
1940 v.val = 0;
1941
1942 break;
1943#endif
1944
5daab9db
CF
1945 case SO_COOKIE:
1946 lv = sizeof(u64);
1947 if (len < lv)
1948 return -EINVAL;
1949 v.val64 = sock_gen_cookie(sk);
1950 break;
1951
76851d12
WB
1952 case SO_ZEROCOPY:
1953 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1954 break;
1955
80b14dee
RC
1956 case SO_TXTIME:
1957 lv = sizeof(v.txtime);
1958 v.txtime.clockid = sk->sk_clockid;
1959 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1960 SOF_TXTIME_DEADLINE_MODE : 0;
4b15c707
JSP
1961 v.txtime.flags |= sk->sk_txtime_report_errors ?
1962 SOF_TXTIME_REPORT_ERRORS : 0;
80b14dee
RC
1963 break;
1964
f5dd3d0c 1965 case SO_BINDTOIFINDEX:
e5fccaa1 1966 v.val = READ_ONCE(sk->sk_bound_dev_if);
f5dd3d0c
DR
1967 break;
1968
e8b9eab9
MP
1969 case SO_NETNS_COOKIE:
1970 lv = sizeof(u64);
1971 if (len != lv)
1972 return -EINVAL;
1973 v.val64 = sock_net(sk)->net_cookie;
1974 break;
1975
04190bf8
PT
1976 case SO_BUF_LOCK:
1977 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1978 break;
1979
2bb2f5fb 1980 case SO_RESERVE_MEM:
fe11fdcb 1981 v.val = READ_ONCE(sk->sk_reserved_mem);
2bb2f5fb
WW
1982 break;
1983
26859240 1984 case SO_TXREHASH:
c76a0328
ED
1985 /* Paired with WRITE_ONCE() in sk_setsockopt() */
1986 v.val = READ_ONCE(sk->sk_txrehash);
26859240
AK
1987 break;
1988
e71a4783 1989 default:
443b5991
YH
1990 /* We implement the SO_SNDLOWAT etc to not be settable
1991 * (1003.1g 7).
1992 */
e71a4783 1993 return -ENOPROTOOPT;
1da177e4 1994 }
e71a4783 1995
1da177e4
LT
1996 if (len > lv)
1997 len = lv;
4ff09db1 1998 if (copy_to_sockptr(optval, &v, len))
1da177e4
LT
1999 return -EFAULT;
2000lenout:
4ff09db1 2001 if (copy_to_sockptr(optlen, &len, sizeof(int)))
4ec93edb
YH
2002 return -EFAULT;
2003 return 0;
1da177e4
LT
2004}
2005
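/* Illustrative sketch (not part of the original file): most branches above
 * fall through to the common copy at the end, where the returned length is
 * clamped to the value size (lv) and written back through optlen. A hedged
 * userspace example reading the 64-bit socket cookie, assuming the
 * SO_COOKIE constant is available from the installed socket headers:
 *
 *	uint64_t cookie;
 *	socklen_t len = sizeof(cookie);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &len) == 0)
 *		printf("cookie=%llu len=%u\n",
 *		       (unsigned long long)cookie, (unsigned)len);
 *
 * SO_COOKIE itself rejects len < 8 with -EINVAL, as in the case above.
 */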
a5b5bb9a
IM
2006/*
2007 * Initialize an sk_lock.
2008 *
2009 * (We also register the sk_lock with the lock validator.)
2010 */
b6f99a21 2011static inline void sock_lock_init(struct sock *sk)
a5b5bb9a 2012{
cdfbabfb
DH
2013 if (sk->sk_kern_sock)
2014 sock_lock_init_class_and_name(
2015 sk,
2016 af_family_kern_slock_key_strings[sk->sk_family],
2017 af_family_kern_slock_keys + sk->sk_family,
2018 af_family_kern_key_strings[sk->sk_family],
2019 af_family_kern_keys + sk->sk_family);
2020 else
2021 sock_lock_init_class_and_name(
2022 sk,
ed07536e
PZ
2023 af_family_slock_key_strings[sk->sk_family],
2024 af_family_slock_keys + sk->sk_family,
2025 af_family_key_strings[sk->sk_family],
2026 af_family_keys + sk->sk_family);
a5b5bb9a
IM
2027}
2028
4dc6dc71
ED
2029/*
2030 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2031 * even temporarily, because of RCU lookups. sk_node should also be left as is.
68835aba 2032 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
4dc6dc71 2033 */
f1a6c4da
PE
2034static void sock_copy(struct sock *nsk, const struct sock *osk)
2035{
b8e202d1 2036 const struct proto *prot = READ_ONCE(osk->sk_prot);
f1a6c4da
PE
2037#ifdef CONFIG_SECURITY_NETWORK
2038 void *sptr = nsk->sk_security;
2039#endif
df610cd9
KI
2040
2041 /* If we move sk_tx_queue_mapping out of the private section,
2042 * we must check if sk_tx_queue_clear() is called after
2043 * sock_copy() in sk_clone_lock().
2044 */
2045 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2046 offsetof(struct sock, sk_dontcopy_begin) ||
2047 offsetof(struct sock, sk_tx_queue_mapping) >=
2048 offsetof(struct sock, sk_dontcopy_end));
2049
68835aba
ED
2050 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2051
2052 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
b8e202d1 2053 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
68835aba 2054
f1a6c4da
PE
2055#ifdef CONFIG_SECURITY_NETWORK
2056 nsk->sk_security = sptr;
2057 security_sk_clone(osk, nsk);
2058#endif
2059}
2060
2e4afe7b
PE
2061static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2062 int family)
c308c1b2
PE
2063{
2064 struct sock *sk;
2065 struct kmem_cache *slab;
2066
2067 slab = prot->slab;
e912b114
ED
2068 if (slab != NULL) {
2069 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2070 if (!sk)
2071 return sk;
6471384a 2072 if (want_init_on_alloc(priority))
ba2489b0 2073 sk_prot_clear_nulls(sk, prot->obj_size);
fcbdf09d 2074 } else
c308c1b2
PE
2075 sk = kmalloc(prot->obj_size, priority);
2076
2e4afe7b
PE
2077 if (sk != NULL) {
2078 if (security_sk_alloc(sk, family, priority))
2079 goto out_free;
2080
2081 if (!try_module_get(prot->owner))
2082 goto out_free_sec;
2083 }
2084
c308c1b2 2085 return sk;
2e4afe7b
PE
2086
2087out_free_sec:
2088 security_sk_free(sk);
2089out_free:
2090 if (slab != NULL)
2091 kmem_cache_free(slab, sk);
2092 else
2093 kfree(sk);
2094 return NULL;
c308c1b2
PE
2095}
2096
2097static void sk_prot_free(struct proto *prot, struct sock *sk)
2098{
2099 struct kmem_cache *slab;
2e4afe7b 2100 struct module *owner;
c308c1b2 2101
2e4afe7b 2102 owner = prot->owner;
c308c1b2 2103 slab = prot->slab;
2e4afe7b 2104
bd1060a1 2105 cgroup_sk_free(&sk->sk_cgrp_data);
2d758073 2106 mem_cgroup_sk_free(sk);
2e4afe7b 2107 security_sk_free(sk);
c308c1b2
PE
2108 if (slab != NULL)
2109 kmem_cache_free(slab, sk);
2110 else
2111 kfree(sk);
2e4afe7b 2112 module_put(owner);
c308c1b2
PE
2113}
2114
1da177e4
LT
2115/**
2116 * sk_alloc - All socket objects are allocated here
c4ea43c5 2117 * @net: the applicable net namespace
4dc3b16b
PP
2118 * @family: protocol family
2119 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2120 * @prot: struct proto associated with this new sock instance
11aa9c28 2121 * @kern: is this to be a kernel socket?
1da177e4 2122 */
1b8d7ae4 2123struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
11aa9c28 2124 struct proto *prot, int kern)
1da177e4 2125{
c308c1b2 2126 struct sock *sk;
1da177e4 2127
154adbc8 2128 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1da177e4 2129 if (sk) {
154adbc8
PE
2130 sk->sk_family = family;
2131 /*
2132 * See comment in struct sock definition to understand
2133 * why we need sk_prot_creator -acme
2134 */
2135 sk->sk_prot = sk->sk_prot_creator = prot;
cdfbabfb 2136 sk->sk_kern_sock = kern;
154adbc8 2137 sock_lock_init(sk);
26abe143 2138 sk->sk_net_refcnt = kern ? 0 : 1;
648845ab 2139 if (likely(sk->sk_net_refcnt)) {
ffa84b5f 2140 get_net_track(net, &sk->ns_tracker, priority);
648845ab 2141 sock_inuse_add(net, 1);
0cafd77d
ED
2142 } else {
2143 __netns_tracker_alloc(net, &sk->ns_tracker,
2144 false, priority);
648845ab
TZ
2145 }
2146
26abe143 2147 sock_net_set(sk, net);
14afee4b 2148 refcount_set(&sk->sk_wmem_alloc, 1);
f8451725 2149
2d758073 2150 mem_cgroup_sk_alloc(sk);
d979a39d 2151 cgroup_sk_alloc(&sk->sk_cgrp_data);
2a56a1fe
TH
2152 sock_update_classid(&sk->sk_cgrp_data);
2153 sock_update_netprioidx(&sk->sk_cgrp_data);
41b14fb8 2154 sk_tx_queue_clear(sk);
1da177e4 2155 }
a79af59e 2156
2e4afe7b 2157 return sk;
1da177e4 2158}
2a91525c 2159EXPORT_SYMBOL(sk_alloc);
1da177e4 2160
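/* Illustrative sketch (not part of the original file): a protocol's
 * ->create() handler typically pairs sk_alloc() with sock_init_data(),
 * both defined in this file. Hedged outline; "example_proto" and
 * PF_EXAMPLE are hypothetical placeholders:
 *
 *	struct sock *sk;
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * Protocol-private initialisation then follows before returning 0.
 */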
a4298e45
ED
2161/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2162 * grace period. This is the case for UDP sockets and TCP listeners.
2163 */
2164static void __sk_destruct(struct rcu_head *head)
1da177e4 2165{
a4298e45 2166 struct sock *sk = container_of(head, struct sock, sk_rcu);
1da177e4 2167 struct sk_filter *filter;
1da177e4
LT
2168
2169 if (sk->sk_destruct)
2170 sk->sk_destruct(sk);
2171
a898def2 2172 filter = rcu_dereference_check(sk->sk_filter,
14afee4b 2173 refcount_read(&sk->sk_wmem_alloc) == 0);
1da177e4 2174 if (filter) {
309dd5fc 2175 sk_filter_uncharge(sk, filter);
a9b3cd7f 2176 RCU_INIT_POINTER(sk->sk_filter, NULL);
1da177e4
LT
2177 }
2178
08e29af3 2179 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1da177e4 2180
6ac99e8f
MKL
2181#ifdef CONFIG_BPF_SYSCALL
2182 bpf_sk_storage_free(sk);
2183#endif
2184
1da177e4 2185 if (atomic_read(&sk->sk_omem_alloc))
e005d193
JP
2186 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2187 __func__, atomic_read(&sk->sk_omem_alloc));
1da177e4 2188
22a0e18e
ED
2189 if (sk->sk_frag.page) {
2190 put_page(sk->sk_frag.page);
2191 sk->sk_frag.page = NULL;
2192 }
2193
35306eb2
ED
2194 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2195 put_cred(sk->sk_peer_cred);
109f6e39 2196 put_pid(sk->sk_peer_pid);
35306eb2 2197
26abe143 2198 if (likely(sk->sk_net_refcnt))
ffa84b5f 2199 put_net_track(sock_net(sk), &sk->ns_tracker);
0cafd77d
ED
2200 else
2201 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2202
c308c1b2 2203 sk_prot_free(sk->sk_prot_creator, sk);
1da177e4 2204}
2b85a34e 2205
a4298e45
ED
2206void sk_destruct(struct sock *sk)
2207{
8c7138b3
MKL
2208 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2209
2210 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2211 reuseport_detach_sock(sk);
2212 use_call_rcu = true;
2213 }
2214
2215 if (use_call_rcu)
a4298e45
ED
2216 call_rcu(&sk->sk_rcu, __sk_destruct);
2217 else
2218 __sk_destruct(&sk->sk_rcu);
2219}
2220
eb4cb008
CG
2221static void __sk_free(struct sock *sk)
2222{
648845ab
TZ
2223 if (likely(sk->sk_net_refcnt))
2224 sock_inuse_add(sock_net(sk), -1);
2225
9709020c 2226 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
eb4cb008
CG
2227 sock_diag_broadcast_destroy(sk);
2228 else
2229 sk_destruct(sk);
2230}
2231
2b85a34e
ED
2232void sk_free(struct sock *sk)
2233{
2234 /*
25985edc 2235 * We subtract one from sk_wmem_alloc and can know if
2b85a34e
ED
2236 * some packets are still in some tx queue.
2237 * If not null, sock_wfree() will call __sk_free(sk) later
2238 */
14afee4b 2239 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2b85a34e
ED
2240 __sk_free(sk);
2241}
2a91525c 2242EXPORT_SYMBOL(sk_free);
1da177e4 2243
581319c5
PA
2244static void sk_init_common(struct sock *sk)
2245{
2246 skb_queue_head_init(&sk->sk_receive_queue);
2247 skb_queue_head_init(&sk->sk_write_queue);
2248 skb_queue_head_init(&sk->sk_error_queue);
2249
2250 rwlock_init(&sk->sk_callback_lock);
2251 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2252 af_rlock_keys + sk->sk_family,
2253 af_family_rlock_key_strings[sk->sk_family]);
2254 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2255 af_wlock_keys + sk->sk_family,
2256 af_family_wlock_key_strings[sk->sk_family]);
2257 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2258 af_elock_keys + sk->sk_family,
2259 af_family_elock_key_strings[sk->sk_family]);
2260 lockdep_set_class_and_name(&sk->sk_callback_lock,
2261 af_callback_keys + sk->sk_family,
2262 af_family_clock_key_strings[sk->sk_family]);
2263}
2264
e56c57d0
ED
2265/**
2266 * sk_clone_lock - clone a socket, and lock its clone
2267 * @sk: the socket to clone
2268 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2269 *
2270 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2271 */
2272struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
87d11ceb 2273{
b8e202d1 2274 struct proto *prot = READ_ONCE(sk->sk_prot);
bbc20b70 2275 struct sk_filter *filter;
278571ba 2276 bool is_charged = true;
bbc20b70 2277 struct sock *newsk;
87d11ceb 2278
b8e202d1 2279 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
bbc20b70
ED
2280 if (!newsk)
2281 goto out;
87d11ceb 2282
bbc20b70 2283 sock_copy(newsk, sk);
9d538fa6 2284
bbc20b70 2285 newsk->sk_prot_creator = prot;
87d11ceb 2286
bbc20b70 2287 /* SANITY */
938cca9e 2288 if (likely(newsk->sk_net_refcnt)) {
ffa84b5f 2289 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
938cca9e 2290 sock_inuse_add(sock_net(newsk), 1);
0cafd77d
ED
2291 } else {
2292 /* Kernel sockets are not elevating the struct net refcount.
2293 * Instead, use a tracker to more easily detect if a layer
2294 * is not properly dismantling its kernel sockets at netns
2295 * destroy time.
2296 */
2297 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2298 false, priority);
938cca9e 2299 }
bbc20b70
ED
2300 sk_node_init(&newsk->sk_node);
2301 sock_lock_init(newsk);
2302 bh_lock_sock(newsk);
2303 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2304 newsk->sk_backlog.len = 0;
87d11ceb 2305
bbc20b70 2306 atomic_set(&newsk->sk_rmem_alloc, 0);
87d11ceb 2307
bbc20b70
ED
2308 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2309 refcount_set(&newsk->sk_wmem_alloc, 1);
d752a498 2310
bbc20b70
ED
2311 atomic_set(&newsk->sk_omem_alloc, 0);
2312 sk_init_common(newsk);
d752a498 2313
bbc20b70
ED
2314 newsk->sk_dst_cache = NULL;
2315 newsk->sk_dst_pending_confirm = 0;
2316 newsk->sk_wmem_queued = 0;
2317 newsk->sk_forward_alloc = 0;
2bb2f5fb 2318 newsk->sk_reserved_mem = 0;
bbc20b70
ED
2319 atomic_set(&newsk->sk_drops, 0);
2320 newsk->sk_send_head = NULL;
2321 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2322 atomic_set(&newsk->sk_zckey, 0);
87d11ceb 2323
bbc20b70 2324 sock_reset_flag(newsk, SOCK_DONE);
87d11ceb 2325
bbc20b70
ED
2326 /* sk->sk_memcg will be populated at accept() time */
2327 newsk->sk_memcg = NULL;
8f51dfc7 2328
bbc20b70 2329 cgroup_sk_clone(&newsk->sk_cgrp_data);
87d11ceb 2330
bbc20b70
ED
2331 rcu_read_lock();
2332 filter = rcu_dereference(sk->sk_filter);
2333 if (filter != NULL)
2334 /* though it's an empty new sock, the charging may fail
2335 * if sysctl_optmem_max was changed between creation of the
2336 * original socket and cloning
2337 */
2338 is_charged = sk_filter_charge(newsk, filter);
2339 RCU_INIT_POINTER(newsk->sk_filter, filter);
2340 rcu_read_unlock();
2341
2342 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2343 /* We need to make sure that we don't uncharge the new
2344 * socket if we couldn't charge it in the first place
2345 * as otherwise we uncharge the parent's filter.
f1ff5ce2 2346 */
bbc20b70
ED
2347 if (!is_charged)
2348 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2349 sk_free_unlock_clone(newsk);
2350 newsk = NULL;
2351 goto out;
2352 }
2353 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
f1ff5ce2 2354
bbc20b70
ED
2355 if (bpf_sk_storage_clone(sk, newsk)) {
2356 sk_free_unlock_clone(newsk);
2357 newsk = NULL;
2358 goto out;
2359 }
d979a39d 2360
bbc20b70
ED
2361 /* Clear sk_user_data if parent had the pointer tagged
2362 * as not suitable for copying when cloning.
2363 */
2364 if (sk_user_data_is_nocopy(newsk))
2365 newsk->sk_user_data = NULL;
2366
2367 newsk->sk_err = 0;
2368 newsk->sk_err_soft = 0;
2369 newsk->sk_priority = 0;
2370 newsk->sk_incoming_cpu = raw_smp_processor_id();
bbc20b70
ED
2371
2372 /* Before updating sk_refcnt, we must commit prior changes to memory
2373 * (Documentation/RCU/rculist_nulls.rst for details)
2374 */
2375 smp_wmb();
2376 refcount_set(&newsk->sk_refcnt, 2);
87d11ceb 2377
bbc20b70
ED
2378 sk_set_socket(newsk, NULL);
2379 sk_tx_queue_clear(newsk);
2380 RCU_INIT_POINTER(newsk->sk_wq, NULL);
87d11ceb 2381
bbc20b70
ED
2382 if (newsk->sk_prot->sockets_allocated)
2383 sk_sockets_allocated_inc(newsk);
704da560 2384
bbc20b70
ED
2385 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2386 net_enable_timestamp();
87d11ceb
ACM
2387out:
2388 return newsk;
2389}
e56c57d0 2390EXPORT_SYMBOL_GPL(sk_clone_lock);
87d11ceb 2391
94352d45
ACM
2392void sk_free_unlock_clone(struct sock *sk)
2393{
2394 /* It is still a raw copy of the parent, so invalidate
2395 * the destructor and use plain sk_free() */
2396 sk->sk_destruct = NULL;
2397 bh_unlock_sock(sk);
2398 sk_free(sk);
2399}
2400EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2401
b1a78b9b 2402static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
7c4e983c 2403{
b1a78b9b
XL
2404 bool is_ipv6 = false;
2405 u32 max_size;
2406
7c4e983c 2407#if IS_ENABLED(CONFIG_IPV6)
b1a78b9b
XL
2408 is_ipv6 = (sk->sk_family == AF_INET6 &&
2409 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
7c4e983c 2410#endif
b1a78b9b
XL
2411 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2412 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2413 READ_ONCE(dst->dev->gso_ipv4_max_size);
2414 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2415 max_size = GSO_LEGACY_MAX_SIZE;
2416
2417 return max_size - (MAX_TCP_HEADER + 1);
7c4e983c
AD
2418}
2419
9958089a
AK
2420void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2421{
d6a4e26a
ED
2422 u32 max_segs = 1;
2423
d0d598ca
ED
2424 sk->sk_route_caps = dst->dev->features;
2425 if (sk_is_tcp(sk))
2426 sk->sk_route_caps |= NETIF_F_GSO;
9958089a 2427 if (sk->sk_route_caps & NETIF_F_GSO)
4fcd6b99 2428 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
aba54656
ED
2429 if (unlikely(sk->sk_gso_disabled))
2430 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
9958089a 2431 if (sk_can_gso(sk)) {
f70f250a 2432 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
9958089a 2433 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
82cc1a7a 2434 } else {
9958089a 2435 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
b1a78b9b 2436 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
6d872df3
ED
2437 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2438 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
82cc1a7a 2439 }
9958089a 2440 }
d6a4e26a 2441 sk->sk_gso_max_segs = max_segs;
448a5ce1 2442 sk_dst_set(sk, dst);
9958089a
AK
2443}
2444EXPORT_SYMBOL_GPL(sk_setup_caps);
2445
1da177e4
LT
2446/*
2447 * Simple resource managers for sockets.
2448 */
2449
2450
4ec93edb
YH
2451/*
2452 * Write buffer destructor automatically called from kfree_skb.
1da177e4
LT
2453 */
2454void sock_wfree(struct sk_buff *skb)
2455{
2456 struct sock *sk = skb->sk;
d99927f4 2457 unsigned int len = skb->truesize;
052ada09 2458 bool free;
1da177e4 2459
d99927f4 2460 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
052ada09
PB
2461 if (sock_flag(sk, SOCK_RCU_FREE) &&
2462 sk->sk_write_space == sock_def_write_space) {
2463 rcu_read_lock();
2464 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
0a8afd9f 2465 sock_def_write_space_wfree(sk);
052ada09
PB
2466 rcu_read_unlock();
2467 if (unlikely(free))
2468 __sk_free(sk);
2469 return;
2470 }
2471
d99927f4
ED
2472 /*
2473 * Keep a reference on sk_wmem_alloc, this will be released
2474 * after sk_write_space() call
2475 */
14afee4b 2476 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1da177e4 2477 sk->sk_write_space(sk);
d99927f4
ED
2478 len = 1;
2479 }
2b85a34e 2480 /*
d99927f4
ED
2481 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2482 * could not do because of in-flight packets
2b85a34e 2483 */
14afee4b 2484 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2b85a34e 2485 __sk_free(sk);
1da177e4 2486}
2a91525c 2487EXPORT_SYMBOL(sock_wfree);
1da177e4 2488
1d2077ac
ED
2489/* This variant of sock_wfree() is used by TCP,
2490 * since it sets SOCK_USE_WRITE_QUEUE.
2491 */
2492void __sock_wfree(struct sk_buff *skb)
2493{
2494 struct sock *sk = skb->sk;
2495
14afee4b 2496 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1d2077ac
ED
2497 __sk_free(sk);
2498}
2499
9e17f8a4
ED
2500void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2501{
2502 skb_orphan(skb);
2503 skb->sk = sk;
2504#ifdef CONFIG_INET
2505 if (unlikely(!sk_fullsock(sk))) {
2506 skb->destructor = sock_edemux;
2507 sock_hold(sk);
2508 return;
2509 }
2510#endif
2511 skb->destructor = sock_wfree;
2512 skb_set_hash_from_sk(skb, sk);
2513 /*
2514 * We used to take a refcount on sk, but the following operation
2515 * is enough to guarantee sk_free() won't free this sock until
2516 * all in-flight packets are completed
2517 */
14afee4b 2518 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
9e17f8a4
ED
2519}
2520EXPORT_SYMBOL(skb_set_owner_w);
2521
41477662
JK
2522static bool can_skb_orphan_partial(const struct sk_buff *skb)
2523{
2524#ifdef CONFIG_TLS_DEVICE
2525 /* Drivers depend on in-order delivery for crypto offload,
2526 * partial orphan breaks out-of-order-OK logic.
2527 */
2528 if (skb->decrypted)
2529 return false;
2530#endif
2531 return (skb->destructor == sock_wfree ||
2532 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2533}
2534
1d2077ac
ED
2535/* This helper is used by netem, as it can hold packets in its
2536 * delay queue. We want to allow the owner socket to send more
2537 * packets, as if they were already TX completed by a typical driver.
2538 * But we also want to keep skb->sk set because some packet schedulers
f6ba8d33 2539 * rely on it (sch_fq for example).
1d2077ac 2540 */
f2f872f9
ED
2541void skb_orphan_partial(struct sk_buff *skb)
2542{
f6ba8d33 2543 if (skb_is_tcp_pure_ack(skb))
1d2077ac
ED
2544 return;
2545
098116e7
PA
2546 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2547 return;
2548
2549 skb_orphan(skb);
f2f872f9
ED
2550}
2551EXPORT_SYMBOL(skb_orphan_partial);
2552
4ec93edb
YH
2553/*
2554 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
2555 */
2556void sock_rfree(struct sk_buff *skb)
2557{
2558 struct sock *sk = skb->sk;
d361fd59 2559 unsigned int len = skb->truesize;
1da177e4 2560
d361fd59
ED
2561 atomic_sub(len, &sk->sk_rmem_alloc);
2562 sk_mem_uncharge(sk, len);
1da177e4 2563}
2a91525c 2564EXPORT_SYMBOL(sock_rfree);
1da177e4 2565
7768eed8
OH
2566/*
2567 * Buffer destructor for skbs that are not used directly in read or write
2568 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2569 */
62bccb8c
AD
2570void sock_efree(struct sk_buff *skb)
2571{
2572 sock_put(skb->sk);
2573}
2574EXPORT_SYMBOL(sock_efree);
2575
cf7fbe66
JS
2576/* Buffer destructor for prefetch/receive path where reference count may
2577 * not be held, e.g. for listen sockets.
2578 */
2579#ifdef CONFIG_INET
2580void sock_pfree(struct sk_buff *skb)
2581{
7ae215d2
JS
2582 if (sk_is_refcounted(skb->sk))
2583 sock_gen_put(skb->sk);
cf7fbe66
JS
2584}
2585EXPORT_SYMBOL(sock_pfree);
2586#endif /* CONFIG_INET */
2587
976d0201 2588kuid_t sock_i_uid(struct sock *sk)
1da177e4 2589{
976d0201 2590 kuid_t uid;
1da177e4 2591
f064af1e 2592 read_lock_bh(&sk->sk_callback_lock);
976d0201 2593 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
f064af1e 2594 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
2595 return uid;
2596}
2a91525c 2597EXPORT_SYMBOL(sock_i_uid);
1da177e4 2598
25a9c8a4 2599unsigned long __sock_i_ino(struct sock *sk)
1da177e4
LT
2600{
2601 unsigned long ino;
2602
25a9c8a4 2603 read_lock(&sk->sk_callback_lock);
1da177e4 2604 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
25a9c8a4
KI
2605 read_unlock(&sk->sk_callback_lock);
2606 return ino;
2607}
2608EXPORT_SYMBOL(__sock_i_ino);
2609
2610unsigned long sock_i_ino(struct sock *sk)
2611{
2612 unsigned long ino;
2613
2614 local_bh_disable();
2615 ino = __sock_i_ino(sk);
2616 local_bh_enable();
1da177e4
LT
2617 return ino;
2618}
2a91525c 2619EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
2620
2621/*
2622 * Allocate a skb from the socket's send buffer.
2623 */
86a76caf 2624struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 2625 gfp_t priority)
1da177e4 2626{
e292f05e
ED
2627 if (force ||
2628 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2a91525c 2629 struct sk_buff *skb = alloc_skb(size, priority);
e292f05e 2630
1da177e4
LT
2631 if (skb) {
2632 skb_set_owner_w(skb, sk);
2633 return skb;
2634 }
2635 }
2636 return NULL;
2637}
2a91525c 2638EXPORT_SYMBOL(sock_wmalloc);
1da177e4 2639
98ba0bd5
WB
2640static void sock_ofree(struct sk_buff *skb)
2641{
2642 struct sock *sk = skb->sk;
2643
2644 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2645}
2646
2647struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2648 gfp_t priority)
2649{
2650 struct sk_buff *skb;
2651
2652 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2653 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
7de6d09f 2654 READ_ONCE(sysctl_optmem_max))
98ba0bd5
WB
2655 return NULL;
2656
2657 skb = alloc_skb(size, priority);
2658 if (!skb)
2659 return NULL;
2660
2661 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2662 skb->sk = sk;
2663 skb->destructor = sock_ofree;
2664 return skb;
2665}
2666
4ec93edb 2667/*
1da177e4 2668 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 2669 */
dd0fc66f 2670void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 2671{
7de6d09f
KI
2672 int optmem_max = READ_ONCE(sysctl_optmem_max);
2673
2674 if ((unsigned int)size <= optmem_max &&
2675 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
1da177e4
LT
2676 void *mem;
2677 /* First do the add, to avoid the race if kmalloc
4ec93edb 2678 * might sleep.
1da177e4
LT
2679 */
2680 atomic_add(size, &sk->sk_omem_alloc);
2681 mem = kmalloc(size, priority);
2682 if (mem)
2683 return mem;
2684 atomic_sub(size, &sk->sk_omem_alloc);
2685 }
2686 return NULL;
2687}
2a91525c 2688EXPORT_SYMBOL(sock_kmalloc);
1da177e4 2689
79e88659
DB
2690/* Free an option memory block. Note, we actually want the inline
2691 * here as this allows gcc to detect the nullify and fold away the
2692 * condition entirely.
1da177e4 2693 */
79e88659
DB
2694static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2695 const bool nullify)
1da177e4 2696{
e53da5fb
DM
2697 if (WARN_ON_ONCE(!mem))
2698 return;
79e88659 2699 if (nullify)
453431a5 2700 kfree_sensitive(mem);
79e88659
DB
2701 else
2702 kfree(mem);
1da177e4
LT
2703 atomic_sub(size, &sk->sk_omem_alloc);
2704}
79e88659
DB
2705
2706void sock_kfree_s(struct sock *sk, void *mem, int size)
2707{
2708 __sock_kfree_s(sk, mem, size, false);
2709}
2a91525c 2710EXPORT_SYMBOL(sock_kfree_s);
1da177e4 2711
79e88659
DB
2712void sock_kzfree_s(struct sock *sk, void *mem, int size)
2713{
2714 __sock_kfree_s(sk, mem, size, true);
2715}
2716EXPORT_SYMBOL(sock_kzfree_s);
2717
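/* Illustrative sketch (not part of the original file): option memory is
 * charged to sk_omem_alloc, so allocations made with sock_kmalloc() must
 * be released with sock_kfree_s() (or sock_kzfree_s() for sensitive data)
 * using the same size, so the accounting above balances:
 *
 *	void *buf = sock_kmalloc(sk, size, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, size);
 */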
1da177e4
LT
2718/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2719 I think, these locks should be removed for datagram sockets.
2720 */
2a91525c 2721static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
2722{
2723 DEFINE_WAIT(wait);
2724
9cd3e072 2725 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
2726 for (;;) {
2727 if (!timeo)
2728 break;
2729 if (signal_pending(current))
2730 break;
2731 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 2732 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
e292f05e 2733 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
1da177e4 2734 break;
afe8764f 2735 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
1da177e4 2736 break;
b1928129 2737 if (READ_ONCE(sk->sk_err))
1da177e4
LT
2738 break;
2739 timeo = schedule_timeout(timeo);
2740 }
aa395145 2741 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2742 return timeo;
2743}
2744
2745
2746/*
2747 * Generic send/receive buffer handlers
2748 */
2749
4cc7f68d
HX
2750struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2751 unsigned long data_len, int noblock,
28d64271 2752 int *errcode, int max_page_order)
1da177e4 2753{
2e4e4410 2754 struct sk_buff *skb;
1da177e4
LT
2755 long timeo;
2756 int err;
2757
1da177e4 2758 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 2759 for (;;) {
1da177e4
LT
2760 err = sock_error(sk);
2761 if (err != 0)
2762 goto failure;
2763
2764 err = -EPIPE;
afe8764f 2765 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
1da177e4
LT
2766 goto failure;
2767
e292f05e 2768 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2e4e4410 2769 break;
28d64271 2770
9cd3e072 2771 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
2772 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2773 err = -EAGAIN;
2774 if (!timeo)
1da177e4 2775 goto failure;
2e4e4410
ED
2776 if (signal_pending(current))
2777 goto interrupted;
2778 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 2779 }
2e4e4410
ED
2780 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2781 errcode, sk->sk_allocation);
2782 if (skb)
2783 skb_set_owner_w(skb, sk);
1da177e4
LT
2784 return skb;
2785
2786interrupted:
2787 err = sock_intr_errno(timeo);
2788failure:
2789 *errcode = err;
2790 return NULL;
2791}
4cc7f68d 2792EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 2793
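/* Illustrative sketch (not part of the original file): a datagram sendmsg
 * implementation might use the helper above roughly like this (hedged;
 * the error label is a placeholder):
 *
 *	struct sk_buff *skb;
 *	int err;
 *
 *	skb = sock_alloc_send_pskb(sk, hlen + len, 0,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		goto out_err;
 *
 * The call blocks within sk_sndtimeo unless MSG_DONTWAIT is set, and
 * returns NULL with *errcode holding -EAGAIN, -EPIPE, an interrupted-wait
 * error, or a pending socket error, matching the loop above.
 */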
233baf9a 2794int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
39771b12
WB
2795 struct sockcm_cookie *sockc)
2796{
3dd17e63
SHY
2797 u32 tsflags;
2798
39771b12
WB
2799 switch (cmsg->cmsg_type) {
2800 case SO_MARK:
91f0d8a4
JK
2801 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2802 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
39771b12
WB
2803 return -EPERM;
2804 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2805 return -EINVAL;
2806 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2807 break;
7f1bc6e9 2808 case SO_TIMESTAMPING_OLD:
3dd17e63
SHY
2809 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2810 return -EINVAL;
2811
2812 tsflags = *(u32 *)CMSG_DATA(cmsg);
2813 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2814 return -EINVAL;
2815
2816 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2817 sockc->tsflags |= tsflags;
2818 break;
80b14dee
RC
2819 case SCM_TXTIME:
2820 if (!sock_flag(sk, SOCK_TXTIME))
2821 return -EINVAL;
2822 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2823 return -EINVAL;
2824 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2825 break;
779f1ede
SHY
2826 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2827 case SCM_RIGHTS:
2828 case SCM_CREDENTIALS:
2829 break;
39771b12
WB
2830 default:
2831 return -EINVAL;
2832 }
2833 return 0;
2834}
2835EXPORT_SYMBOL(__sock_cmsg_send);
2836
f28ea365
EJ
2837int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2838 struct sockcm_cookie *sockc)
2839{
2840 struct cmsghdr *cmsg;
39771b12 2841 int ret;
f28ea365
EJ
2842
2843 for_each_cmsghdr(cmsg, msg) {
2844 if (!CMSG_OK(msg, cmsg))
2845 return -EINVAL;
2846 if (cmsg->cmsg_level != SOL_SOCKET)
2847 continue;
233baf9a 2848 ret = __sock_cmsg_send(sk, cmsg, sockc);
39771b12
WB
2849 if (ret)
2850 return ret;
f28ea365
EJ
2851 }
2852 return 0;
2853}
2854EXPORT_SYMBOL(sock_cmsg_send);
2855
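/* Illustrative sketch (not part of the original file): the SOL_SOCKET
 * control messages parsed above can be attached per call from userspace.
 * A hedged example setting a mark on one sendmsg() (needs CAP_NET_RAW or
 * CAP_NET_ADMIN, as enforced above):
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = { 0 };
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *	uint32_t mark = 42;
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SO_MARK;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(mark));
 *	memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));
 *
 * msg_name and msg_iov are omitted for brevity; the message is then sent
 * with sendmsg(fd, &msg, 0).
 */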
06044751
ED
2856static void sk_enter_memory_pressure(struct sock *sk)
2857{
2858 if (!sk->sk_prot->enter_memory_pressure)
2859 return;
2860
2861 sk->sk_prot->enter_memory_pressure(sk);
2862}
2863
2864static void sk_leave_memory_pressure(struct sock *sk)
2865{
2866 if (sk->sk_prot->leave_memory_pressure) {
5c1ebbfa
BV
2867 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2868 tcp_leave_memory_pressure, sk);
06044751
ED
2869 } else {
2870 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2871
503978ac
ED
2872 if (memory_pressure && READ_ONCE(*memory_pressure))
2873 WRITE_ONCE(*memory_pressure, 0);
06044751
ED
2874 }
2875}
2876
ce27ec60 2877DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
5640f768 2878
400dfd3a
ED
2879/**
2880 * skb_page_frag_refill - check that a page_frag contains enough room
2881 * @sz: minimum size of the fragment we want to get
2882 * @pfrag: pointer to page_frag
82d5e2b8 2883 * @gfp: priority for memory allocation
400dfd3a
ED
2884 *
2885 * Note: While this allocator tries to use high order pages, there is
2886 * no guarantee that allocations succeed. Therefore, @sz MUST be
2887 * less than or equal to PAGE_SIZE.
2888 */
d9b2938a 2889bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 2890{
5640f768 2891 if (pfrag->page) {
fe896d18 2892 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
2893 pfrag->offset = 0;
2894 return true;
2895 }
400dfd3a 2896 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
2897 return true;
2898 put_page(pfrag->page);
2899 }
2900
d9b2938a 2901 pfrag->offset = 0;
ce27ec60
ED
2902 if (SKB_FRAG_PAGE_ORDER &&
2903 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
d0164adc
MG
2904 /* Avoid direct reclaim but allow kswapd to wake */
2905 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2906 __GFP_COMP | __GFP_NOWARN |
2907 __GFP_NORETRY,
d9b2938a 2908 SKB_FRAG_PAGE_ORDER);
5640f768 2909 if (likely(pfrag->page)) {
d9b2938a 2910 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
2911 return true;
2912 }
d9b2938a
ED
2913 }
2914 pfrag->page = alloc_page(gfp);
2915 if (likely(pfrag->page)) {
2916 pfrag->size = PAGE_SIZE;
2917 return true;
2918 }
400dfd3a
ED
2919 return false;
2920}
2921EXPORT_SYMBOL(skb_page_frag_refill);
2922
2923bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2924{
2925 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2926 return true;
2927
5640f768
ED
2928 sk_enter_memory_pressure(sk);
2929 sk_stream_moderate_sndbuf(sk);
2930 return false;
2931}
2932EXPORT_SYMBOL(sk_page_frag_refill);
2933
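/* Illustrative sketch (not part of the original file): a sendmsg copy loop
 * typically refills the per-socket (or per-task) page_frag before copying
 * user data into it. Hedged outline, using the sk_page_frag() helper:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *
 * User data is then copied into pfrag->page at pfrag->offset, and
 * pfrag->offset is advanced by the amount copied. On failure the helper
 * has already entered memory pressure and moderated the send buffer, so
 * callers usually wait for memory and retry.
 */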
ad80b0fc 2934void __lock_sock(struct sock *sk)
f39234d6
NK
2935 __releases(&sk->sk_lock.slock)
2936 __acquires(&sk->sk_lock.slock)
1da177e4
LT
2937{
2938 DEFINE_WAIT(wait);
2939
e71a4783 2940 for (;;) {
1da177e4
LT
2941 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2942 TASK_UNINTERRUPTIBLE);
2943 spin_unlock_bh(&sk->sk_lock.slock);
2944 schedule();
2945 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 2946 if (!sock_owned_by_user(sk))
1da177e4
LT
2947 break;
2948 }
2949 finish_wait(&sk->sk_lock.wq, &wait);
2950}
2951
8873c064 2952void __release_sock(struct sock *sk)
f39234d6
NK
2953 __releases(&sk->sk_lock.slock)
2954 __acquires(&sk->sk_lock.slock)
1da177e4 2955{
5413d1ba 2956 struct sk_buff *skb, *next;
1da177e4 2957
5413d1ba 2958 while ((skb = sk->sk_backlog.head) != NULL) {
1da177e4 2959 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1da177e4 2960
5413d1ba 2961 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 2962
5413d1ba
ED
2963 do {
2964 next = skb->next;
e4cbb02a 2965 prefetch(next);
63fbdd3c 2966 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
a8305bff 2967 skb_mark_not_on_list(skb);
c57943a1 2968 sk_backlog_rcv(sk, skb);
1da177e4 2969
5413d1ba 2970 cond_resched();
1da177e4
LT
2971
2972 skb = next;
2973 } while (skb != NULL);
2974
5413d1ba
ED
2975 spin_lock_bh(&sk->sk_lock.slock);
2976 }
8eae939f
ZY
2977
2978 /*
2979 * Doing the zeroing here guarantees we cannot loop forever
2980 * while a wild producer attempts to flood us.
2981 */
2982 sk->sk_backlog.len = 0;
1da177e4
LT
2983}
2984
d41a69f1
ED
2985void __sk_flush_backlog(struct sock *sk)
2986{
2987 spin_lock_bh(&sk->sk_lock.slock);
2988 __release_sock(sk);
4505dc2a
ED
2989
2990 if (sk->sk_prot->release_cb)
41862d12
ED
2991 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
2992 tcp_release_cb, sk);
2993
d41a69f1
ED
2994 spin_unlock_bh(&sk->sk_lock.slock);
2995}
c46b0183 2996EXPORT_SYMBOL_GPL(__sk_flush_backlog);
d41a69f1 2997
1da177e4
LT
2998/**
2999 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
3000 * @sk: sock to wait on
3001 * @timeo: for how long
dfbafc99 3002 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
3003 *
3004 * Now socket state including sk->sk_err is changed only under lock,
3005 * hence we may omit checks after joining the wait queue.
3006 * We check the receive queue before schedule() only as an optimization;
3007 * it is very likely that release_sock() added new data.
3008 */
dfbafc99 3009int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4 3010{
d9dc8b0f 3011 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1da177e4 3012 int rc;
1da177e4 3013
d9dc8b0f 3014 add_wait_queue(sk_sleep(sk), &wait);
9cd3e072 3015 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 3016 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
9cd3e072 3017 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 3018 remove_wait_queue(sk_sleep(sk), &wait);
1da177e4
LT
3019 return rc;
3020}
1da177e4
LT
3021EXPORT_SYMBOL(sk_wait_data);
3022
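/* Illustrative sketch (not part of the original file): a blocking recvmsg
 * path typically calls sk_wait_data() with the socket locked, passing the
 * last skb it has already seen (or NULL). Hedged outline:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *
 * sk_wait_data() releases and re-takes the socket lock around the wait,
 * so the receive queue must be re-checked afterwards.
 */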
3ab224be 3023/**
f8c3bf00 3024 * __sk_mem_raise_allocated - increase memory_allocated
3ab224be
HA
3025 * @sk: socket
3026 * @size: memory size to allocate
f8c3bf00 3027 * @amt: pages to allocate
3ab224be
HA
3028 * @kind: allocation type
3029 *
66e6369e
AW
3030 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3031 *
3032 * Unlike the globally shared limits among the sockets under the same protocol,
3033 * consuming the budget of a memcg won't have a direct effect on other ones.
3034 * So be optimistic about memcg's tolerance, and leave the callers to decide
3035 * whether or not to raise allocated through sk_under_memory_pressure() or
3036 * its variants.
3ab224be 3037 */
f8c3bf00 3038int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3ab224be 3039{
2def8ff3 3040 struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
219160be 3041 struct proto *prot = sk->sk_prot;
2def8ff3 3042 bool charged = false;
219160be 3043 long allocated;
e805605c 3044
219160be
ED
3045 sk_memory_allocated_add(sk, amt);
3046 allocated = sk_memory_allocated(sk);
2def8ff3
AW
3047
3048 if (memcg) {
3049 if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3050 goto suppress_allocation;
3051 charged = true;
3052 }
3ab224be
HA
3053
3054 /* Under limit. */
e805605c 3055 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 3056 sk_leave_memory_pressure(sk);
3ab224be
HA
3057 return 1;
3058 }
3059
e805605c
JW
3060 /* Under pressure. */
3061 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 3062 sk_enter_memory_pressure(sk);
3ab224be 3063
e805605c
JW
3064 /* Over hard limit. */
3065 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
3066 goto suppress_allocation;
3067
2e12072c
AW
3068 /* Guarantee minimum buffer size under pressure (either global
3069 * or memcg) to make sure features described in RFC 7323 (TCP
3070 * Extensions for High Performance) work properly.
3071 *
3072 * This rule does NOT apply when the usage exceeds the global or memcg hard
3073 * limit, or else a DoS attack could be mounted by spawning
3074 * lots of sockets whose usage stays under the minimum buffer size.
3075 */
3ab224be 3076 if (kind == SK_MEM_RECV) {
a3dcaf17 3077 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3ab224be 3078 return 1;
180d8cd9 3079
3ab224be 3080 } else { /* SK_MEM_SEND */
a3dcaf17
ED
3081 int wmem0 = sk_get_wmem0(sk, prot);
3082
3ab224be 3083 if (sk->sk_type == SOCK_STREAM) {
a3dcaf17 3084 if (sk->sk_wmem_queued < wmem0)
3ab224be 3085 return 1;
a3dcaf17 3086 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3ab224be 3087 return 1;
a3dcaf17 3088 }
3ab224be
HA
3089 }
3090
180d8cd9 3091 if (sk_has_memory_pressure(sk)) {
5bf325a5 3092 u64 alloc;
1748376b 3093
66e6369e
AW
3094 /* The following 'average' heuristic is within the
3095 * scope of global accounting, so it only makes
3096 * sense for global memory pressure.
3097 */
3098 if (!sk_under_global_memory_pressure(sk))
1748376b 3099 return 1;
2e12072c
AW
3100
3101 /* Try to be fair among all the sockets under global
3102 * pressure by allowing the ones whose usage is below
3103 * average to raise.
3104 */
180d8cd9
GC
3105 alloc = sk_sockets_allocated_read_positive(sk);
3106 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
3107 sk_mem_pages(sk->sk_wmem_queued +
3108 atomic_read(&sk->sk_rmem_alloc) +
3109 sk->sk_forward_alloc))
3110 return 1;
3111 }
3112
3113suppress_allocation:
3114
3115 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3116 sk_stream_moderate_sndbuf(sk);
3117
3118 /* Fail only if socket is _under_ its sndbuf.
3119 * In this case we cannot block, so that we have to fail.
3120 */
4b1327be
WW
3121 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3122 /* Force charge with __GFP_NOFAIL */
2def8ff3
AW
3123 if (memcg && !charged) {
3124 mem_cgroup_charge_skmem(memcg, amt,
4b1327be
WW
3125 gfp_memcg_charge() | __GFP_NOFAIL);
3126 }
3ab224be 3127 return 1;
4b1327be 3128 }
3ab224be
HA
3129 }
3130
d6f19938
YS
3131 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3132 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3847ce32 3133
0e90b31f 3134 sk_memory_allocated_sub(sk, amt);
180d8cd9 3135
2def8ff3
AW
3136 if (charged)
3137 mem_cgroup_uncharge_skmem(memcg, amt);
e805605c 3138
3ab224be
HA
3139 return 0;
3140}
f8c3bf00
PA
3141
3142/**
3143 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3144 * @sk: socket
3145 * @size: memory size to allocate
3146 * @kind: allocation type
3147 *
3148 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3149 * rmem allocation. This function assumes that protocols which have
3150 * memory_pressure use sk_wmem_queued as write buffer accounting.
3151 */
3152int __sk_mem_schedule(struct sock *sk, int size, int kind)
3153{
3154 int ret, amt = sk_mem_pages(size);
3155
5e6300e7 3156 sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
f8c3bf00
PA
3157 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3158 if (!ret)
5e6300e7 3159 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
f8c3bf00
PA
3160 return ret;
3161}
3ab224be
HA
3162EXPORT_SYMBOL(__sk_mem_schedule);
3163
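/* Worked example (not part of the original file), assuming 4 KiB pages:
 * __sk_mem_schedule(sk, 3000, SK_MEM_RECV) computes amt = sk_mem_pages(3000)
 * = 1, optimistically adds 1 << PAGE_SHIFT = 4096 bytes to sk_forward_alloc,
 * and only backs that out if __sk_mem_raise_allocated() refuses the charge.
 * A 5000-byte request would round up to two pages (8192 bytes) the same way.
 */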
3164/**
f8c3bf00 3165 * __sk_mem_reduce_allocated - reclaim memory_allocated
3ab224be 3166 * @sk: socket
f8c3bf00
PA
3167 * @amount: number of quanta
3168 *
3169 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3ab224be 3170 */
f8c3bf00 3171void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3ab224be 3172{
1a24e04e 3173 sk_memory_allocated_sub(sk, amount);
3ab224be 3174
baac50bb
JW
3175 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3176 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 3177
2d0c88e8 3178 if (sk_under_global_memory_pressure(sk) &&
180d8cd9
GC
3179 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3180 sk_leave_memory_pressure(sk);
3ab224be 3181}
f8c3bf00
PA
3182
3183/**
3184 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3185 * @sk: socket
100fdd1f 3186 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
f8c3bf00
PA
3187 */
3188void __sk_mem_reclaim(struct sock *sk, int amount)
3189{
100fdd1f 3190 amount >>= PAGE_SHIFT;
5e6300e7 3191 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
f8c3bf00
PA
3192 __sk_mem_reduce_allocated(sk, amount);
3193}
3ab224be
HA
3194EXPORT_SYMBOL(__sk_mem_reclaim);
3195
627d2d6b 3196int sk_set_peek_off(struct sock *sk, int val)
3197{
11695c6e 3198 WRITE_ONCE(sk->sk_peek_off, val);
627d2d6b 3199 return 0;
3200}
3201EXPORT_SYMBOL_GPL(sk_set_peek_off);
3ab224be 3202
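/* Illustrative sketch (not part of the original file): sk_peek_off backs
 * the SO_PEEK_OFF option on socket types that wire up ->set_peek_off
 * (AF_UNIX, for example). A hedged userspace use, peeking through queued
 * data without consuming it:
 *
 *	char buf[128];
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);
 *
 * Each MSG_PEEK advances the stored peek offset instead of always peeking
 * from the head of the queue. A getsockopt(SO_PEEK_OFF) on a socket
 * without ->set_peek_off returns -EOPNOTSUPP, as in the SO_PEEK_OFF case
 * of sk_getsockopt() above.
 */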
1da177e4
LT
3203/*
3204 * Set of default routines for initialising struct proto_ops when
3205 * the protocol does not support a particular function. In certain
3206 * cases where it makes no sense for a protocol to have a "do nothing"
3207 * function, some default processing is provided.
3208 */
3209
3210int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3211{
3212 return -EOPNOTSUPP;
3213}
2a91525c 3214EXPORT_SYMBOL(sock_no_bind);
1da177e4 3215
4ec93edb 3216int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
3217 int len, int flags)
3218{
3219 return -EOPNOTSUPP;
3220}
2a91525c 3221EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
3222
3223int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3224{
3225 return -EOPNOTSUPP;
3226}
2a91525c 3227EXPORT_SYMBOL(sock_no_socketpair);
1da177e4 3228
cdfbabfb
DH
3229int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3230 bool kern)
1da177e4
LT
3231{
3232 return -EOPNOTSUPP;
3233}
2a91525c 3234EXPORT_SYMBOL(sock_no_accept);
1da177e4 3235
4ec93edb 3236int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
9b2c45d4 3237 int peer)
1da177e4
LT
3238{
3239 return -EOPNOTSUPP;
3240}
2a91525c 3241EXPORT_SYMBOL(sock_no_getname);
1da177e4 3242
1da177e4
LT
3243int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3244{
3245 return -EOPNOTSUPP;
3246}
2a91525c 3247EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
3248
3249int sock_no_listen(struct socket *sock, int backlog)
3250{
3251 return -EOPNOTSUPP;
3252}
2a91525c 3253EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
3254
3255int sock_no_shutdown(struct socket *sock, int how)
3256{
3257 return -EOPNOTSUPP;
3258}
2a91525c 3259EXPORT_SYMBOL(sock_no_shutdown);
1da177e4 3260
1b784140 3261int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
3262{
3263 return -EOPNOTSUPP;
3264}
2a91525c 3265EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 3266
306b13eb
TH
3267int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3268{
3269 return -EOPNOTSUPP;
3270}
3271EXPORT_SYMBOL(sock_no_sendmsg_locked);
3272
1b784140
YX
3273int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3274 int flags)
1da177e4
LT
3275{
3276 return -EOPNOTSUPP;
3277}
2a91525c 3278EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
3279
3280int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3281{
3282 /* Mirror missing mmap method error code */
3283 return -ENODEV;
3284}
2a91525c 3285EXPORT_SYMBOL(sock_no_mmap);
1da177e4 3286
d9539752
KC
3287/*
3288 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3289 * various sock-based usage counts.
3290 */
3291void __receive_sock(struct file *file)
3292{
3293 struct socket *sock;
d9539752 3294
dba4a925 3295 sock = sock_from_file(file);
d9539752
KC
3296 if (sock) {
3297 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3298 sock_update_classid(&sock->sk->sk_cgrp_data);
3299 }
3300}
3301
1da177e4
LT
3302/*
3303 * Default Socket Callbacks
3304 */
3305
3306static void sock_def_wakeup(struct sock *sk)
3307{
43815482
ED
3308 struct socket_wq *wq;
3309
3310 rcu_read_lock();
3311 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 3312 if (skwq_has_sleeper(wq))
43815482
ED
3313 wake_up_interruptible_all(&wq->wait);
3314 rcu_read_unlock();
1da177e4
LT
3315}
3316
3317static void sock_def_error_report(struct sock *sk)
3318{
43815482
ED
3319 struct socket_wq *wq;
3320
3321 rcu_read_lock();
3322 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 3323 if (skwq_has_sleeper(wq))
a9a08845 3324 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
8d8ad9d7 3325 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 3326 rcu_read_unlock();
1da177e4
LT
3327}
3328
43a825af 3329void sock_def_readable(struct sock *sk)
1da177e4 3330{
43815482
ED
3331 struct socket_wq *wq;
3332
40e0b090
PY
3333 trace_sk_data_ready(sk);
3334
43815482
ED
3335 rcu_read_lock();
3336 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 3337 if (skwq_has_sleeper(wq))
a9a08845
LT
3338 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3339 EPOLLRDNORM | EPOLLRDBAND);
8d8ad9d7 3340 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 3341 rcu_read_unlock();
1da177e4
LT
3342}
3343
3344static void sock_def_write_space(struct sock *sk)
3345{
43815482
ED
3346 struct socket_wq *wq;
3347
3348 rcu_read_lock();
1da177e4
LT
3349
3350 /* Do not wake up a writer until he can make "significant"
3351 * progress. --DaveM
3352 */
14bfee9b 3353 if (sock_writeable(sk)) {
43815482 3354 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 3355 if (skwq_has_sleeper(wq))
a9a08845
LT
3356 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3357 EPOLLWRNORM | EPOLLWRBAND);
1da177e4
LT
3358
3359 /* Should agree with poll, otherwise some programs break */
14bfee9b 3360 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
3361 }
3362
43815482 3363 rcu_read_unlock();
1da177e4
LT
3364}
3365
0a8afd9f
PB
3366/* An optimised version of sock_def_write_space(), should only be called
3367 * for SOCK_RCU_FREE sockets under RCU read section and after putting
3368 * ->sk_wmem_alloc.
3369 */
3370static void sock_def_write_space_wfree(struct sock *sk)
3371{
3372 /* Do not wake up a writer until he can make "significant"
3373 * progress. --DaveM
3374 */
3375 if (sock_writeable(sk)) {
3376 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3377
3378 /* rely on refcount_sub from sock_wfree() */
3379 smp_mb__after_atomic();
3380 if (wq && waitqueue_active(&wq->wait))
3381 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3382 EPOLLWRNORM | EPOLLWRBAND);
3383
3384 /* Should agree with poll, otherwise some programs break */
3385 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3386 }
3387}
3388
1da177e4
LT
3389static void sock_def_destruct(struct sock *sk)
3390{
1da177e4
LT
3391}
3392
3393void sk_send_sigurg(struct sock *sk)
3394{
3395 if (sk->sk_socket && sk->sk_socket->file)
3396 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 3397 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 3398}
2a91525c 3399EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
3400
3401void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3402 unsigned long expires)
3403{
3404 if (!mod_timer(timer, expires))
3405 sock_hold(sk);
3406}
1da177e4
LT
3407EXPORT_SYMBOL(sk_reset_timer);
3408
3409void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3410{
25cc4ae9 3411 if (del_timer(timer))
1da177e4
LT
3412 __sock_put(sk);
3413}
1da177e4
LT
3414EXPORT_SYMBOL(sk_stop_timer);
3415
08b81d87
GT
3416void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3417{
3418 if (del_timer_sync(timer))
3419 __sock_put(sk);
3420}
3421EXPORT_SYMBOL(sk_stop_timer_sync);
3422
584f3742 3423void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
1da177e4 3424{
581319c5 3425 sk_init_common(sk);
1da177e4
LT
3426 sk->sk_send_head = NULL;
3427
99767f27 3428 timer_setup(&sk->sk_timer, NULL, 0);
4ec93edb 3429
1da177e4 3430 sk->sk_allocation = GFP_KERNEL;
1227c177
KI
3431 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3432 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
1da177e4 3433 sk->sk_state = TCP_CLOSE;
fb87bd47 3434 sk->sk_use_task_frag = true;
972692e0 3435 sk_set_socket(sk, sock);
1da177e4
LT
3436
3437 sock_set_flag(sk, SOCK_ZAPPED);
3438
e71a4783 3439 if (sock) {
1da177e4 3440 sk->sk_type = sock->type;
333f7909 3441 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
1da177e4 3442 sock->sk = sk;
86741ec2 3443 } else {
c2f26e8f 3444 RCU_INIT_POINTER(sk->sk_wq, NULL);
86741ec2 3445 }
584f3742 3446 sk->sk_uid = uid;
1da177e4 3447
1da177e4 3448 rwlock_init(&sk->sk_callback_lock);
cdfbabfb
DH
3449 if (sk->sk_kern_sock)
3450 lockdep_set_class_and_name(
3451 &sk->sk_callback_lock,
3452 af_kern_callback_keys + sk->sk_family,
3453 af_family_kern_clock_key_strings[sk->sk_family]);
3454 else
3455 lockdep_set_class_and_name(
3456 &sk->sk_callback_lock,
443aef0e
PZ
3457 af_callback_keys + sk->sk_family,
3458 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
3459
3460 sk->sk_state_change = sock_def_wakeup;
3461 sk->sk_data_ready = sock_def_readable;
3462 sk->sk_write_space = sock_def_write_space;
3463 sk->sk_error_report = sock_def_error_report;
3464 sk->sk_destruct = sock_def_destruct;
3465
5640f768
ED
3466 sk->sk_frag.page = NULL;
3467 sk->sk_frag.offset = 0;
ef64a54f 3468 sk->sk_peek_off = -1;
1da177e4 3469
109f6e39
EB
3470 sk->sk_peer_pid = NULL;
3471 sk->sk_peer_cred = NULL;
35306eb2
ED
3472 spin_lock_init(&sk->sk_peer_lock);
3473
1da177e4
LT
3474 sk->sk_write_pending = 0;
3475 sk->sk_rcvlowat = 1;
3476 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3477 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3478
6c7c98ba 3479 sk->sk_stamp = SK_DEFAULT_STAMP;
3a0ed3e9
DD
3480#if BITS_PER_LONG==32
3481 seqlock_init(&sk->sk_stamp_seq);
3482#endif
52267790 3483 atomic_set(&sk->sk_zckey, 0);
1da177e4 3484
e0d1095a 3485#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 3486 sk->sk_napi_id = 0;
e59ef36f 3487 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
06021292
ET
3488#endif
3489
76a9ebe8
ED
3490 sk->sk_max_pacing_rate = ~0UL;
3491 sk->sk_pacing_rate = ~0UL;
7c68fa2b 3492 WRITE_ONCE(sk->sk_pacing_shift, 10);
70da268b 3493 sk->sk_incoming_cpu = -1;
c6345ce7
AN
3494
3495 sk_rx_queue_clear(sk);
4dc6dc71
ED
3496 /*
3497 * Before updating sk_refcnt, we must commit prior changes to memory
2cdb54c9 3498 * (Documentation/RCU/rculist_nulls.rst for details)
4dc6dc71
ED
3499 */
3500 smp_wmb();
41c6d650 3501 refcount_set(&sk->sk_refcnt, 1);
33c732c3 3502 atomic_set(&sk->sk_drops, 0);
1da177e4 3503}
584f3742
PB
3504EXPORT_SYMBOL(sock_init_data_uid);
3505
3506void sock_init_data(struct socket *sock, struct sock *sk)
3507{
3508 kuid_t uid = sock ?
3509 SOCK_INODE(sock)->i_uid :
3510 make_kuid(sock_net(sk)->user_ns, 0);
3511
3512 sock_init_data_uid(sock, sk, uid);
3513}
2a91525c 3514EXPORT_SYMBOL(sock_init_data);
1da177e4 3515
b5606c2d 3516void lock_sock_nested(struct sock *sk, int subclass)
1da177e4 3517{
2dcb96ba
TG
3518 /* The sk_lock has mutex_lock() semantics here. */
3519 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3520
1da177e4 3521 might_sleep();
a5b5bb9a 3522 spin_lock_bh(&sk->sk_lock.slock);
33d60fbd 3523 if (sock_owned_by_user_nocheck(sk))
1da177e4 3524 __lock_sock(sk);
d2e9117c 3525 sk->sk_lock.owned = 1;
2dcb96ba 3526 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 3527}
fcc70d5f 3528EXPORT_SYMBOL(lock_sock_nested);
1da177e4 3529
b5606c2d 3530void release_sock(struct sock *sk)
1da177e4 3531{
a5b5bb9a 3532 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
3533 if (sk->sk_backlog.tail)
3534 __release_sock(sk);
46d3ceab
ED
3535
3536 if (sk->sk_prot->release_cb)
41862d12
ED
3537 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3538 tcp_release_cb, sk);
46d3ceab 3539
c3f9b018 3540 sock_release_ownership(sk);
a5b5bb9a
IM
3541 if (waitqueue_active(&sk->sk_lock.wq))
3542 wake_up(&sk->sk_lock.wq);
3543 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
3544}
3545EXPORT_SYMBOL(release_sock);
3546
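/* Illustrative sketch (not part of the original file): the usual pattern
 * in process context is the lock_sock()/release_sock() pair, where
 * lock_sock() is the subclass-0 wrapper around lock_sock_nested() and
 * release_sock() above flushes the backlog queued by softirq handlers:
 *
 *	lock_sock(sk);
 *	... modify socket state owned by the user context ...
 *	release_sock(sk);
 */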
49054556 3547bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
8a74ad60
ED
3548{
3549 might_sleep();
3550 spin_lock_bh(&sk->sk_lock.slock);
3551
33d60fbd 3552 if (!sock_owned_by_user_nocheck(sk)) {
8a74ad60 3553 /*
2dcb96ba
TG
3554 * Fast path return with bottom halves disabled and
3555 * sock::sk_lock.slock held.
3556 *
3557 * The 'mutex' is not contended and holding
3558 * sock::sk_lock.slock prevents all other lockers to
3559 * proceed so the corresponding unlock_sock_fast() can
3560 * avoid the slow path of release_sock() completely and
3561 * just release slock.
3562 *
3563 * From a semantical POV this is equivalent to 'acquiring'
3564 * the 'mutex', hence the corresponding lockdep
3565 * mutex_release() has to happen in the fast path of
3566 * unlock_sock_fast().
8a74ad60
ED
3567 */
3568 return false;
2dcb96ba 3569 }
8a74ad60
ED
3570
3571 __lock_sock(sk);
3572 sk->sk_lock.owned = 1;
12f4bd86 3573 __acquire(&sk->sk_lock.slock);
2dcb96ba 3574 spin_unlock_bh(&sk->sk_lock.slock);
8a74ad60
ED
3575 return true;
3576}
49054556 3577EXPORT_SYMBOL(__lock_sock_fast);
8a74ad60 3578
c7cbdbf2
AB
3579int sock_gettstamp(struct socket *sock, void __user *userstamp,
3580 bool timeval, bool time32)
4ec93edb 3581{
c7cbdbf2
AB
3582 struct sock *sk = sock->sk;
3583 struct timespec64 ts;
9dae3497
YS
3584
3585 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
c7cbdbf2
AB
3586 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3587 if (ts.tv_sec == -1)
1da177e4 3588 return -ENOENT;
c7cbdbf2 3589 if (ts.tv_sec == 0) {
3a0ed3e9 3590 ktime_t kt = ktime_get_real();
f95f96a4 3591 sock_write_timestamp(sk, kt);
c7cbdbf2 3592 ts = ktime_to_timespec64(kt);
b7aa0bf7 3593 }
1da177e4 3594
c7cbdbf2
AB
3595 if (timeval)
3596 ts.tv_nsec /= 1000;
9dae3497 3597
c7cbdbf2
AB
3598#ifdef CONFIG_COMPAT_32BIT_TIME
3599 if (time32)
3600 return put_old_timespec32(&ts, userstamp);
3601#endif
3602#ifdef CONFIG_SPARC64
3603 /* beware of padding in sparc64 timeval */
3604 if (timeval && !in_compat_syscall()) {
3605 struct __kernel_old_timeval __user tv = {
c98f4822
SR
3606 .tv_sec = ts.tv_sec,
3607 .tv_usec = ts.tv_nsec,
c7cbdbf2 3608 };
c98f4822 3609 if (copy_to_user(userstamp, &tv, sizeof(tv)))
c7cbdbf2
AB
3610 return -EFAULT;
3611 return 0;
ae40eb1e 3612 }
c7cbdbf2
AB
3613#endif
3614 return put_timespec64(&ts, userstamp);
ae40eb1e 3615}
c7cbdbf2 3616EXPORT_SYMBOL(sock_gettstamp);
ae40eb1e 3617
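Illustrative sketch (not part of sock.c): sock_gettstamp() backs the SIOCGSTAMP/SIOCGSTAMPNS ioctls, which let userspace read the timestamp of the last packet received on a socket. A minimal userspace usage follows; it assumes a datagram has already arrived on fd, and the exact header providing SIOCGSTAMP can vary by libc.

#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <linux/sockios.h>	/* SIOCGSTAMP */

static void print_last_rx_stamp(int fd)
{
	struct timeval tv;

	/* Fails (errno ENOENT) if no packet has been timestamped yet. */
	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
		printf("last packet: %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);
}
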
193d357d 3618void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
4ec93edb 3619{
20d49473 3620 if (!sock_flag(sk, flag)) {
08e29af3
ED
3621 unsigned long previous_flags = sk->sk_flags;
3622
20d49473
PO
3623 sock_set_flag(sk, flag);
3624 /*
3625 * We just set one of the two flags which require net
3626 * time stamping, but time stamping might have been on
3627 * already because of the other one.
3628 */
080a270f
HFS
3629 if (sock_needs_netstamp(sk) &&
3630 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 3631 net_enable_timestamp();
1da177e4
LT
3632 }
3633}
1da177e4 3634
cb820f8e
RC
3635int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3636 int level, int type)
3637{
3638 struct sock_exterr_skb *serr;
364a9e93 3639 struct sk_buff *skb;
cb820f8e
RC
3640 int copied, err;
3641
3642 err = -EAGAIN;
364a9e93 3643 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
3644 if (skb == NULL)
3645 goto out;
3646
3647 copied = skb->len;
3648 if (copied > len) {
3649 msg->msg_flags |= MSG_TRUNC;
3650 copied = len;
3651 }
51f3d02b 3652 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
3653 if (err)
3654 goto out_free_skb;
3655
3656 sock_recv_timestamp(msg, sk, skb);
3657
3658 serr = SKB_EXT_ERR(skb);
3659 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3660
3661 msg->msg_flags |= MSG_ERRQUEUE;
3662 err = copied;
3663
cb820f8e
RC
3664out_free_skb:
3665 kfree_skb(skb);
3666out:
3667 return err;
3668}
3669EXPORT_SYMBOL(sock_recv_errqueue);
3670
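Illustrative sketch (not part of sock.c): the per-socket error queue drained by helpers such as sock_recv_errqueue() is read from userspace with recvmsg(MSG_ERRQUEUE), delivering a struct sock_extended_err control message per queued error. A minimal, non-blocking drain loop; the buffer sizes are arbitrary.

#include <errno.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/errqueue.h>	/* struct sock_extended_err */

static void drain_errqueue(int fd)
{
	char data[256], control[512];

	for (;;) {
		struct msghdr msg;
		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };

		memset(&msg, 0, sizeof(msg));
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		msg.msg_control = control;
		msg.msg_controllen = sizeof(control);

		if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
			break;	/* EAGAIN: the error queue is empty */

		/* Each queued error arrives as a control message. */
		for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm;
		     cm = CMSG_NXTHDR(&msg, cm)) {
			struct sock_extended_err *ee = (void *)CMSG_DATA(cm);
			/* inspect ee->ee_origin, ee->ee_errno, ... */
			(void)ee;
		}
	}
}
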
1da177e4
LT
3671/*
3672 * Get a socket option on a socket.
3673 *
3674 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3675 * asynchronous errors should be reported by getsockopt. We assume
3676 * this means if you specify SO_ERROR (otherwise what's the point of it?).
3677 */
3678int sock_common_getsockopt(struct socket *sock, int level, int optname,
3679 char __user *optval, int __user *optlen)
3680{
3681 struct sock *sk = sock->sk;
3682
364f997b
KI
3683 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3684 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
1da177e4 3685}
1da177e4
LT
3686EXPORT_SYMBOL(sock_common_getsockopt);
3687
1b784140
YX
3688int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3689 int flags)
1da177e4
LT
3690{
3691 struct sock *sk = sock->sk;
3692 int addr_len = 0;
3693 int err;
3694
ec095263 3695 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
1da177e4
LT
3696 if (err >= 0)
3697 msg->msg_namelen = addr_len;
3698 return err;
3699}
1da177e4
LT
3700EXPORT_SYMBOL(sock_common_recvmsg);
3701
3702/*
3703 * Set socket options on an inet socket.
3704 */
3705int sock_common_setsockopt(struct socket *sock, int level, int optname,
a7b75c5a 3706 sockptr_t optval, unsigned int optlen)
1da177e4
LT
3707{
3708 struct sock *sk = sock->sk;
3709
364f997b
KI
3710 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3711 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
1da177e4 3712}
1da177e4
LT
3713EXPORT_SYMBOL(sock_common_setsockopt);
3714
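Illustrative sketch (not part of sock.c): protocols that need no extra translation typically plug these generic helpers straight into their struct proto_ops, as the inet stream/dgram ops do. The structure below is a hypothetical, abbreviated example; the remaining mandatory proto_ops callbacks are omitted.

#include <linux/module.h>
#include <linux/net.h>
#include <net/sock.h>

/* Hypothetical, abbreviated proto_ops using the generic helpers above. */
static const struct proto_ops my_proto_ops = {
	.family	    = PF_UNSPEC,
	.owner	    = THIS_MODULE,
	.setsockopt = sock_common_setsockopt,
	.getsockopt = sock_common_getsockopt,
	.recvmsg    = sock_common_recvmsg,
	/* ... remaining callbacks (bind, connect, sendmsg, ...) ... */
};
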
3715void sk_common_release(struct sock *sk)
3716{
3717 if (sk->sk_prot->destroy)
3718 sk->sk_prot->destroy(sk);
3719
3720 /*
645f0897 3721 * Observation: when sk_common_release is called, processes have
1da177e4
LT
3722 * no access to the socket, but the network stack still does.
3723 * Step one, detach it from networking:
3724 *
3725 * A. Remove from hash tables.
3726 */
3727
3728 sk->sk_prot->unhash(sk);
3729
3730 /*
3731 * At this point the socket cannot receive new packets, but it is possible
3732 * that some packets are still in flight because some CPU is running the
3733 * receiver and did the hash table lookup before we unhashed the socket.
3734 * They will reach the receive queue and be purged by the socket destructor.
3735 *
3736 * Also we still have packets pending on the receive queue and probably
3737 * our own packets waiting in device queues. sock_destroy will drain the
3738 * receive queue, but transmitted packets will delay socket destruction
3739 * until the last reference is released.
3740 */
3741
3742 sock_orphan(sk);
3743
3744 xfrm_sk_free_policy(sk);
3745
1da177e4
LT
3746 sock_put(sk);
3747}
1da177e4
LT
3748EXPORT_SYMBOL(sk_common_release);
3749
a2d133b1
JH
3750void sk_get_meminfo(const struct sock *sk, u32 *mem)
3751{
3752 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3753
3754 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
ebb3b78d 3755 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
a2d133b1 3756 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
e292f05e 3757 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
66d58f04 3758 mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
ab4e846a 3759 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
a2d133b1 3760 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
70c26558 3761 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
a2d133b1
JH
3762 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3763}
3764
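Illustrative sketch (not part of sock.c): sk_get_meminfo() fills a u32 array indexed by the SK_MEMINFO_* constants, which diag code can copy into a netlink attribute. A minimal sketch; the helper name and the attribute type are hypothetical placeholders.

#include <linux/sock_diag.h>	/* SK_MEMINFO_* */
#include <net/netlink.h>
#include <net/sock.h>

/* Hypothetical helper dumping the per-socket memory counters. */
static int my_put_skmeminfo(struct sk_buff *skb, const struct sock *sk,
			    int attrtype)
{
	u32 mem[SK_MEMINFO_VARS];

	sk_get_meminfo(sk, mem);
	return nla_put(skb, attrtype, sizeof(mem), mem);
}
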
13ff3d6f 3765#ifdef CONFIG_PROC_FS
13ff3d6f 3766static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159 3767
70ee1159
PE
3768int sock_prot_inuse_get(struct net *net, struct proto *prot)
3769{
3770 int cpu, idx = prot->inuse_idx;
3771 int res = 0;
3772
3773 for_each_possible_cpu(cpu)
08fc7f81 3774 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
70ee1159
PE
3775
3776 return res >= 0 ? res : 0;
3777}
3778EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3779
648845ab
TZ
3780int sock_inuse_get(struct net *net)
3781{
3782 int cpu, res = 0;
3783
3784 for_each_possible_cpu(cpu)
4199bae1 3785 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
648845ab
TZ
3786
3787 return res;
3788}
3789
3790EXPORT_SYMBOL_GPL(sock_inuse_get);
3791
2c8c1e72 3792static int __net_init sock_inuse_init_net(struct net *net)
70ee1159 3793{
08fc7f81 3794 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
648845ab
TZ
3795 if (net->core.prot_inuse == NULL)
3796 return -ENOMEM;
648845ab 3797 return 0;
70ee1159
PE
3798}
3799
2c8c1e72 3800static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159 3801{
08fc7f81 3802 free_percpu(net->core.prot_inuse);
70ee1159
PE
3803}
3804
3805static struct pernet_operations net_inuse_ops = {
3806 .init = sock_inuse_init_net,
3807 .exit = sock_inuse_exit_net,
3808};
3809
3810static __init int net_inuse_init(void)
3811{
3812 if (register_pernet_subsys(&net_inuse_ops))
3813 panic("Cannot initialize net inuse counters");
3814
3815 return 0;
3816}
3817
3818core_initcall(net_inuse_init);
13ff3d6f 3819
b45ce321 3820static int assign_proto_idx(struct proto *prot)
13ff3d6f
PE
3821{
3822 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3823
3824 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 3825 pr_err("PROTO_INUSE_NR exhausted\n");
b45ce321 3826 return -ENOSPC;
13ff3d6f
PE
3827 }
3828
3829 set_bit(prot->inuse_idx, proto_inuse_idx);
b45ce321 3830 return 0;
13ff3d6f
PE
3831}
3832
3833static void release_proto_idx(struct proto *prot)
3834{
3835 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3836 clear_bit(prot->inuse_idx, proto_inuse_idx);
3837}
3838#else
b45ce321 3839static inline int assign_proto_idx(struct proto *prot)
13ff3d6f 3840{
b45ce321 3841 return 0;
13ff3d6f
PE
3842}
3843
3844static inline void release_proto_idx(struct proto *prot)
3845{
3846}
648845ab 3847
13ff3d6f
PE
3848#endif
3849
0f5907af
ML
3850static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3851{
3852 if (!twsk_prot)
3853 return;
3854 kfree(twsk_prot->twsk_slab_name);
3855 twsk_prot->twsk_slab_name = NULL;
3856 kmem_cache_destroy(twsk_prot->twsk_slab);
3857 twsk_prot->twsk_slab = NULL;
3858}
3859
b80350f3
TZ
3860static int tw_prot_init(const struct proto *prot)
3861{
3862 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3863
3864 if (!twsk_prot)
3865 return 0;
3866
3867 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3868 prot->name);
3869 if (!twsk_prot->twsk_slab_name)
3870 return -ENOMEM;
3871
3872 twsk_prot->twsk_slab =
3873 kmem_cache_create(twsk_prot->twsk_slab_name,
3874 twsk_prot->twsk_obj_size, 0,
3875 SLAB_ACCOUNT | prot->slab_flags,
3876 NULL);
3877 if (!twsk_prot->twsk_slab) {
3878 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3879 prot->name);
3880 return -ENOMEM;
3881 }
3882
3883 return 0;
3884}
3885
0159dfd3
ED
3886static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3887{
3888 if (!rsk_prot)
3889 return;
3890 kfree(rsk_prot->slab_name);
3891 rsk_prot->slab_name = NULL;
adf78eda
JL
3892 kmem_cache_destroy(rsk_prot->slab);
3893 rsk_prot->slab = NULL;
0159dfd3
ED
3894}
3895
3896static int req_prot_init(const struct proto *prot)
3897{
3898 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3899
3900 if (!rsk_prot)
3901 return 0;
3902
3903 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3904 prot->name);
3905 if (!rsk_prot->slab_name)
3906 return -ENOMEM;
3907
3908 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3909 rsk_prot->obj_size, 0,
e699e2c6
SB
3910 SLAB_ACCOUNT | prot->slab_flags,
3911 NULL);
0159dfd3
ED
3912
3913 if (!rsk_prot->slab) {
3914 pr_crit("%s: Can't create request sock SLAB cache!\n",
3915 prot->name);
3916 return -ENOMEM;
3917 }
3918 return 0;
3919}
3920
b733c007
PE
3921int proto_register(struct proto *prot, int alloc_slab)
3922{
b45ce321 3923 int ret = -ENOBUFS;
3924
f20cfd66
ED
3925 if (prot->memory_allocated && !prot->sysctl_mem) {
3926 pr_err("%s: missing sysctl_mem\n", prot->name);
3927 return -EINVAL;
3928 }
0defbb0a
ED
3929 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3930 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3931 return -EINVAL;
3932 }
1da177e4 3933 if (alloc_slab) {
30c2c9f1
DW
3934 prot->slab = kmem_cache_create_usercopy(prot->name,
3935 prot->obj_size, 0,
e699e2c6
SB
3936 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3937 prot->slab_flags,
289a4860 3938 prot->useroffset, prot->usersize,
271b72c7 3939 NULL);
1da177e4
LT
3940
3941 if (prot->slab == NULL) {
e005d193
JP
3942 pr_crit("%s: Can't create sock SLAB cache!\n",
3943 prot->name);
60e7663d 3944 goto out;
1da177e4 3945 }
2e6599cb 3946
0159dfd3
ED
3947 if (req_prot_init(prot))
3948 goto out_free_request_sock_slab;
8feaf0c0 3949
b80350f3
TZ
3950 if (tw_prot_init(prot))
3951 goto out_free_timewait_sock_slab;
1da177e4
LT
3952 }
3953
36b77a52 3954 mutex_lock(&proto_list_mutex);
b45ce321 3955 ret = assign_proto_idx(prot);
3956 if (ret) {
3957 mutex_unlock(&proto_list_mutex);
0f5907af 3958 goto out_free_timewait_sock_slab;
b45ce321 3959 }
1da177e4 3960 list_add(&prot->node, &proto_list);
36b77a52 3961 mutex_unlock(&proto_list_mutex);
b45ce321 3962 return ret;
b733c007 3963
0f5907af 3964out_free_timewait_sock_slab:
ed744d81 3965 if (alloc_slab)
0f5907af 3966 tw_prot_cleanup(prot->twsk_prot);
8feaf0c0 3967out_free_request_sock_slab:
b45ce321 3968 if (alloc_slab) {
3969 req_prot_cleanup(prot->rsk_prot);
0159dfd3 3970
b45ce321 3971 kmem_cache_destroy(prot->slab);
3972 prot->slab = NULL;
3973 }
b733c007 3974out:
b45ce321 3975 return ret;
1da177e4 3976}
1da177e4
LT
3977EXPORT_SYMBOL(proto_register);
3978
3979void proto_unregister(struct proto *prot)
3980{
36b77a52 3981 mutex_lock(&proto_list_mutex);
13ff3d6f 3982 release_proto_idx(prot);
0a3f4358 3983 list_del(&prot->node);
36b77a52 3984 mutex_unlock(&proto_list_mutex);
1da177e4 3985
adf78eda
JL
3986 kmem_cache_destroy(prot->slab);
3987 prot->slab = NULL;
1da177e4 3988
0159dfd3 3989 req_prot_cleanup(prot->rsk_prot);
0f5907af 3990 tw_prot_cleanup(prot->twsk_prot);
1da177e4 3991}
1da177e4
LT
3992EXPORT_SYMBOL(proto_unregister);
3993
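Illustrative sketch (not part of sock.c): a protocol registers its struct proto once at module init and unregisters it on exit. Passing alloc_slab=1 requests the per-protocol sock slab created above. struct my_sock, my_proto and the module hooks below are hypothetical.

#include <linux/module.h>
#include <net/sock.h>

/* Hypothetical protocol-private socket and its struct proto. */
struct my_sock {
	struct sock sk;		/* must be first */
	int my_private_state;
};

static struct proto my_proto = {
	.name	  = "MYPROTO",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct my_sock),
};

static int __init my_proto_init(void)
{
	return proto_register(&my_proto, 1);	/* 1: allocate a sock slab */
}

static void __exit my_proto_exit(void)
{
	proto_unregister(&my_proto);
}

module_init(my_proto_init);
module_exit(my_proto_exit);
MODULE_LICENSE("GPL");
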
bf2ae2e4
XL
3994int sock_load_diag_module(int family, int protocol)
3995{
3996 if (!protocol) {
3997 if (!sock_is_registered(family))
3998 return -ENOENT;
3999
4000 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4001 NETLINK_SOCK_DIAG, family);
4002 }
4003
4004#ifdef CONFIG_INET
4005 if (family == AF_INET &&
c34c1287 4006 protocol != IPPROTO_RAW &&
3f935c75 4007 protocol < MAX_INET_PROTOS &&
bf2ae2e4
XL
4008 !rcu_access_pointer(inet_protos[protocol]))
4009 return -ENOENT;
4010#endif
4011
4012 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4013 NETLINK_SOCK_DIAG, family, protocol);
4014}
4015EXPORT_SYMBOL(sock_load_diag_module);
4016
1da177e4 4017#ifdef CONFIG_PROC_FS
1da177e4 4018static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 4019 __acquires(proto_list_mutex)
1da177e4 4020{
36b77a52 4021 mutex_lock(&proto_list_mutex);
60f0438a 4022 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
4023}
4024
4025static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4026{
60f0438a 4027 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
4028}
4029
4030static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 4031 __releases(proto_list_mutex)
1da177e4 4032{
36b77a52 4033 mutex_unlock(&proto_list_mutex);
1da177e4
LT
4034}
4035
4036static char proto_method_implemented(const void *method)
4037{
4038 return method == NULL ? 'n' : 'y';
4039}
180d8cd9
GC
4040static long sock_prot_memory_allocated(struct proto *proto)
4041{
cb75a36c 4042 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
4043}
4044
7a512eb8 4045static const char *sock_prot_memory_pressure(struct proto *proto)
180d8cd9
GC
4046{
4047 return proto->memory_pressure != NULL ?
4048 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4049}
1da177e4
LT
4050
4051static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4052{
180d8cd9 4053
8d987e5c 4054 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
dc97391e 4055 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1da177e4
LT
4056 proto->name,
4057 proto->obj_size,
14e943db 4058 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
4059 sock_prot_memory_allocated(proto),
4060 sock_prot_memory_pressure(proto),
1da177e4
LT
4061 proto->max_header,
4062 proto->slab == NULL ? "no" : "yes",
4063 module_name(proto->owner),
4064 proto_method_implemented(proto->close),
4065 proto_method_implemented(proto->connect),
4066 proto_method_implemented(proto->disconnect),
4067 proto_method_implemented(proto->accept),
4068 proto_method_implemented(proto->ioctl),
4069 proto_method_implemented(proto->init),
4070 proto_method_implemented(proto->destroy),
4071 proto_method_implemented(proto->shutdown),
4072 proto_method_implemented(proto->setsockopt),
4073 proto_method_implemented(proto->getsockopt),
4074 proto_method_implemented(proto->sendmsg),
4075 proto_method_implemented(proto->recvmsg),
1da177e4
LT
4076 proto_method_implemented(proto->bind),
4077 proto_method_implemented(proto->backlog_rcv),
4078 proto_method_implemented(proto->hash),
4079 proto_method_implemented(proto->unhash),
4080 proto_method_implemented(proto->get_port),
4081 proto_method_implemented(proto->enter_memory_pressure));
4082}
4083
4084static int proto_seq_show(struct seq_file *seq, void *v)
4085{
60f0438a 4086 if (v == &proto_list)
1da177e4
LT
4087 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4088 "protocol",
4089 "size",
4090 "sockets",
4091 "memory",
4092 "press",
4093 "maxhdr",
4094 "slab",
4095 "module",
dc97391e 4096 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
1da177e4 4097 else
60f0438a 4098 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
4099 return 0;
4100}
4101
f690808e 4102static const struct seq_operations proto_seq_ops = {
1da177e4
LT
4103 .start = proto_seq_start,
4104 .next = proto_seq_next,
4105 .stop = proto_seq_stop,
4106 .show = proto_seq_show,
4107};
4108
14e943db
ED
4109static __net_init int proto_init_net(struct net *net)
4110{
c3506372
CH
4111 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4112 sizeof(struct seq_net_private)))
14e943db
ED
4113 return -ENOMEM;
4114
4115 return 0;
4116}
4117
4118static __net_exit void proto_exit_net(struct net *net)
4119{
ece31ffd 4120 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
4121}
4122
4123
4124static __net_initdata struct pernet_operations proto_net_ops = {
4125 .init = proto_init_net,
4126 .exit = proto_exit_net,
1da177e4
LT
4127};
4128
4129static int __init proto_init(void)
4130{
14e943db 4131 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
4132}
4133
4134subsys_initcall(proto_init);
4135
4136#endif /* PROC_FS */
7db6b048
SS
4137
4138#ifdef CONFIG_NET_RX_BUSY_POLL
4139bool sk_busy_loop_end(void *p, unsigned long start_time)
4140{
4141 struct sock *sk = p;
4142
3f926af3 4143 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
7db6b048
SS
4144 sk_busy_loop_timeout(sk, start_time);
4145}
4146EXPORT_SYMBOL(sk_busy_loop_end);
4147#endif /* CONFIG_NET_RX_BUSY_POLL */
c0425a42
CH
4148
4149int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4150{
4151 if (!sk->sk_prot->bind_add)
4152 return -EOPNOTSUPP;
4153 return sk->sk_prot->bind_add(sk, addr, addr_len);
4154}
4155EXPORT_SYMBOL(sock_bind_add);
e1d001fa
BL
4156
4157/* Copy 'size' bytes from userspace, run the ioctl, and copy 'size' bytes back to userspace */
4158int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4159 void __user *arg, void *karg, size_t size)
4160{
4161 int ret;
4162
4163 if (copy_from_user(karg, arg, size))
4164 return -EFAULT;
4165
4166 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4167 if (ret)
4168 return ret;
4169
4170 if (copy_to_user(arg, karg, size))
4171 return -EFAULT;
4172
4173 return 0;
4174}
4175EXPORT_SYMBOL(sock_ioctl_inout);
4176
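Illustrative sketch (not part of sock.c): a protocol ioctl path that transfers a fixed-size structure both ways can delegate the user-memory handling to sock_ioctl_inout(), which copies the argument in, invokes the protocol's ->ioctl with kernel memory, and copies the result back. struct my_ioctl_req and the wrapper below are hypothetical.

#include <net/sock.h>

/* Hypothetical fixed-size ioctl argument. */
struct my_ioctl_req {
	u32 index;
	u32 value;
};

/* Hypothetical wrapper: copy in, let the protocol fill it, copy back. */
static int my_handle_req_ioctl(struct sock *sk, unsigned int cmd,
			       void __user *arg)
{
	struct my_ioctl_req req;

	return sock_ioctl_inout(sk, cmd, arg, &req, sizeof(req));
}
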
4177/* This is the most common ioctl prep function, where the result (4 bytes) is
4178 * copied back to userspace if the ioctl() returns successfully. No input
4179 * argument is copied from userspace.
4180 */
4181static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4182{
4183 int ret, karg = 0;
4184
4185 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4186 if (ret)
4187 return ret;
4188
4189 return put_user(karg, (int __user *)arg);
4190}
4191
4192/* A wrapper around sock ioctls, which copies the data from userspace
4193 * (depending on the protocol/ioctl), and copies back the result to userspace.
4194 * The main motivation for this function is to pass kernel memory to the
4195 * protocol ioctl callbacks, instead of userspace memory.
4196 */
4197int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4198{
4199 int rc = 1;
4200
634236b3 4201 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
e1d001fa 4202 rc = ipmr_sk_ioctl(sk, cmd, arg);
634236b3 4203 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
e1d001fa
BL
4204 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4205 else if (sk_is_phonet(sk))
4206 rc = phonet_sk_ioctl(sk, cmd, arg);
4207
4208 /* If the ioctl was processed, return its value */
4209 if (rc <= 0)
4210 return rc;
4211
4212 /* Otherwise call the default handler */
4213 return sock_ioctl_out(sk, cmd, arg);
4214}
4215EXPORT_SYMBOL(sk_ioctl);