/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *	Alan Cox	:	Numerous verify_area() problems
 *	Alan Cox	:	Connecting on a connecting socket
 *				now returns an error for tcp.
 *	Alan Cox	:	sock->protocol is set correctly.
 *				and is not sometimes left as 0.
 *	Alan Cox	:	connect handles icmp errors on a
 *				connect properly. Unfortunately there
 *				is a restart syscall nasty there. I
 *				can't match BSD without hacking the C
 *				library. Ideas urgently sought!
 *	Alan Cox	:	Disallow bind() to addresses that are
 *				not ours - especially broadcast ones!!
 *	Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *	Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *				instead they leave that for the DESTROY timer.
 *	Alan Cox	:	Clean up error flag in accept
 *	Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *				was buggy. Put a remove_sock() in the handler
 *				for memory when we hit 0. Also altered the timer
 *				code. The ACK stuff can wait and needs major
 *				TCP layer surgery.
 *	Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *				and fixed timer/inet_bh race.
 *	Alan Cox	:	Added zapped flag for TCP
 *	Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *	Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *	Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *	Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *	Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *	Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *	C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *	Alan Cox	:	Fixed connect() taking signals I think.
 *	Alan Cox	:	SO_LINGER supported
 *	Alan Cox	:	Error reporting fixes
 *	Anonymous	:	inet_create tidied up (sk->reuse setting)
 *	Alan Cox	:	inet sockets don't set sk->type!
 *	Alan Cox	:	Split socket option code
 *	Alan Cox	:	Callbacks
 *	Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *	Alex		:	Removed restriction on inet fioctl
 *	Alan Cox	:	Splitting INET from NET core
 *	Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *	Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *	Alan Cox	:	Split IP from generic code
 *	Alan Cox	:	New kfree_skbmem()
 *	Alan Cox	:	Make SO_DEBUG superuser only.
 *	Alan Cox	:	Allow anyone to clear SO_DEBUG
 *				(compatibility fix)
 *	Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *	Alan Cox	:	Allocator for a socket is settable.
 *	Alan Cox	:	SO_ERROR includes soft errors.
 *	Alan Cox	:	Allow NULL arguments on some SO_ opts
 *	Alan Cox	:	Generic socket allocation to make hooks
 *				easier (suggested by Craig Metz).
 *	Michael Pall	:	SO_ERROR returns positive errno again
 *	Steve Whitehouse:	Added default destructor to free
 *				protocol private data.
 *	Steve Whitehouse:	Added various other default routines
 *				common to several socket families.
 *	Chris Evans	:	Call suser() check last on F_SETOWN
 *	Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *	Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *	Andi Kleen	:	Fix write_space callback
 *	Chris Evans	:	Security fixes - signedness again
 *	Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x) \
	x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
	x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
	x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
	x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
	x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
	x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
	x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
	x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
	x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
	x "27" , x "28" , x "AF_CAN" , \
	x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
	x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
	x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
	x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
	x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
	x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

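/* Helper for SO_RCVTIMEO/SO_SNDTIMEO: copy a struct timeval from userspace
 * and convert it into a timeout in jiffies.  A zero timeval means "block
 * indefinitely" (MAX_SCHEDULE_TIMEOUT); negative seconds are clamped to a
 * zero timeout with a rate-limited warning.
 */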
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

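/* AF_UNSPEC and AF_UNIX sockets do not receive packets through the netdev
 * RX path, so they do not need the global net_enable_timestamp() /
 * net_disable_timestamp() bookkeeping used below.
 */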
static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the rcu protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

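/* Illustrative userspace usage (not part of this file): this function is
 * typically reached via the setsockopt(2) system call, e.g.
 *
 *	int val = 262144;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * Note that SO_SNDBUF/SO_RCVBUF values are doubled on the way in (see the
 * comments in the corresponding cases below), so a later getsockopt() will
 * report roughly twice the requested size.
 */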
/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock_txtime sk_txtime;
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 * Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else if (val != sk->sk_mark) {
			sk->sk_mark = val;
			sk_dst_reset(sk);
		}
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (val != ~0U)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (!((sk->sk_type == SOCK_STREAM &&
			       sk->sk_protocol == IPPROTO_TCP) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	case SO_TXTIME:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else if (optlen != sizeof(struct sock_txtime)) {
			ret = -EINVAL;
		} else if (copy_from_user(&sk_txtime, optval,
					  sizeof(struct sock_txtime))) {
			ret = -EFAULT;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
		} else {
			sock_valbool_flag(sk, SOCK_TXTIME, true);
			sk->sk_clockid = sk_txtime.clockid;
			sk->sk_txtime_deadline_mode =
				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
			sk->sk_txtime_report_errors =
				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
		}
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

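/* Copy the peer's supplementary group list to userspace for SO_PEERGROUPS,
 * translating each kgid into the caller's user namespace.
 */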
static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		struct linger ling;
		struct timeval tm;
		struct sock_txtime txtime;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		/* 32bit version */
		v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	if (sock_flag(sk, SOCK_RCU_FREE))
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}

/**
 * sk_clone_lock - clone a socket, and lock its clone
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		newsk->sk_prot_creator = sk->sk_prot;

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		refcount_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		sk_init_common(newsk);

		newsk->sk_dst_cache = NULL;
		newsk->sk_dst_pending_confirm = 0;
		newsk->sk_wmem_queued = 0;
		newsk->sk_forward_alloc = 0;
		atomic_set(&newsk->sk_drops, 0);
		newsk->sk_send_head = NULL;
		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
		atomic_set(&newsk->sk_zckey, 0);

		sock_reset_flag(newsk, SOCK_DONE);
		mem_cgroup_sk_alloc(newsk);
		cgroup_sk_alloc(&newsk->sk_cgrp_data);

		rcu_read_lock();
		filter = rcu_dereference(sk->sk_filter);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);
		RCU_INIT_POINTER(newsk->sk_filter, filter);
		rcu_read_unlock();

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* We need to make sure that we don't uncharge the new
			 * socket if we couldn't charge it in the first place
			 * as otherwise we uncharge the parent's filter.
			 */
			if (!is_charged)
				RCU_INIT_POINTER(newsk->sk_filter, NULL);
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_err = 0;
		newsk->sk_err_soft = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);
		if (likely(newsk->sk_net_refcnt))
			sock_inuse_add(sock_net(newsk), 1);

		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		refcount_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 * Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
1da177e4 1828
1d2077ac
ED
1829/* This variant of sock_wfree() is used by TCP,
1830 * since it sets SOCK_USE_WRITE_QUEUE.
1831 */
1832void __sock_wfree(struct sk_buff *skb)
1833{
1834 struct sock *sk = skb->sk;
1835
14afee4b 1836 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1d2077ac
ED
1837 __sk_free(sk);
1838}
1839
9e17f8a4
ED
1840void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1841{
1842 skb_orphan(skb);
1843 skb->sk = sk;
1844#ifdef CONFIG_INET
1845 if (unlikely(!sk_fullsock(sk))) {
1846 skb->destructor = sock_edemux;
1847 sock_hold(sk);
1848 return;
1849 }
1850#endif
1851 skb->destructor = sock_wfree;
1852 skb_set_hash_from_sk(skb, sk);
1853 /*
1854 * We used to take a refcount on sk, but the following operation
1855 * is enough to guarantee sk_free() won't free this sock until
1856 * all in-flight packets have completed
1857 */
14afee4b 1858 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
9e17f8a4
ED
1859}
1860EXPORT_SYMBOL(skb_set_owner_w);
1861
1d2077ac
ED
1862/* This helper is used by netem, as it can hold packets in its
1863 * delay queue. We want to allow the owner socket to send more
1864 * packets, as if they were already TX completed by a typical driver.
1865 * But we also want to keep skb->sk set because some packet schedulers
f6ba8d33 1866 * rely on it (sch_fq for example).
1d2077ac 1867 */
f2f872f9
ED
1868void skb_orphan_partial(struct sk_buff *skb)
1869{
f6ba8d33 1870 if (skb_is_tcp_pure_ack(skb))
1d2077ac
ED
1871 return;
1872
f2f872f9
ED
1873 if (skb->destructor == sock_wfree
1874#ifdef CONFIG_INET
1875 || skb->destructor == tcp_wfree
1876#endif
1877 ) {
f6ba8d33
ED
1878 struct sock *sk = skb->sk;
1879
41c6d650 1880 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
14afee4b 1881 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
f6ba8d33
ED
1882 skb->destructor = sock_efree;
1883 }
f2f872f9
ED
1884 } else {
1885 skb_orphan(skb);
1886 }
1887}
1888EXPORT_SYMBOL(skb_orphan_partial);
1889
4ec93edb
YH
1890/*
1891 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
1892 */
1893void sock_rfree(struct sk_buff *skb)
1894{
1895 struct sock *sk = skb->sk;
d361fd59 1896 unsigned int len = skb->truesize;
1da177e4 1897
d361fd59
ED
1898 atomic_sub(len, &sk->sk_rmem_alloc);
1899 sk_mem_uncharge(sk, len);
1da177e4 1900}
2a91525c 1901EXPORT_SYMBOL(sock_rfree);
1da177e4 1902
7768eed8
OH
1903/*
1904 * Buffer destructor for skbs that are not used directly in read or write
1905 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1906 */
62bccb8c
AD
1907void sock_efree(struct sk_buff *skb)
1908{
1909 sock_put(skb->sk);
1910}
1911EXPORT_SYMBOL(sock_efree);
1912
976d0201 1913kuid_t sock_i_uid(struct sock *sk)
1da177e4 1914{
976d0201 1915 kuid_t uid;
1da177e4 1916
f064af1e 1917 read_lock_bh(&sk->sk_callback_lock);
976d0201 1918 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
f064af1e 1919 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1920 return uid;
1921}
2a91525c 1922EXPORT_SYMBOL(sock_i_uid);
1da177e4
LT
1923
1924unsigned long sock_i_ino(struct sock *sk)
1925{
1926 unsigned long ino;
1927
f064af1e 1928 read_lock_bh(&sk->sk_callback_lock);
1da177e4 1929 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
f064af1e 1930 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1931 return ino;
1932}
2a91525c 1933EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
1934
1935/*
1936 * Allocate a skb from the socket's send buffer.
1937 */
86a76caf 1938struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 1939 gfp_t priority)
1da177e4 1940{
14afee4b 1941 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2a91525c 1942 struct sk_buff *skb = alloc_skb(size, priority);
1da177e4
LT
1943 if (skb) {
1944 skb_set_owner_w(skb, sk);
1945 return skb;
1946 }
1947 }
1948 return NULL;
1949}
2a91525c 1950EXPORT_SYMBOL(sock_wmalloc);
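/*
 * Usage sketch (hypothetical caller, not part of this file): allocate an
 * skb charged against the socket's send buffer.  With force == 0 the
 * allocation is refused once sk_wmem_alloc reaches sk_sndbuf, so the
 * caller must be prepared to wait or drop.  example_xmit() is an
 * illustrative name only.
 */
static int example_xmit(struct sock *sk, const void *data, unsigned int len)
{
	struct sk_buff *skb = sock_wmalloc(sk, len, 0, sk->sk_allocation);

	if (!skb)
		return -ENOBUFS;
	skb_put_data(skb, data, len);
	/*
	 * Freeing the skb (here, or at TX completion) ends up in
	 * sock_wfree(), which uncharges sk_wmem_alloc and calls
	 * sk_write_space() to wake blocked writers.
	 */
	kfree_skb(skb);
	return 0;
}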
1da177e4 1951
98ba0bd5
WB
1952static void sock_ofree(struct sk_buff *skb)
1953{
1954 struct sock *sk = skb->sk;
1955
1956 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1957}
1958
1959struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1960 gfp_t priority)
1961{
1962 struct sk_buff *skb;
1963
1964 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1965 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1966 sysctl_optmem_max)
1967 return NULL;
1968
1969 skb = alloc_skb(size, priority);
1970 if (!skb)
1971 return NULL;
1972
1973 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1974 skb->sk = sk;
1975 skb->destructor = sock_ofree;
1976 return skb;
1977}
1978
4ec93edb 1979/*
1da177e4 1980 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 1981 */
dd0fc66f 1982void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 1983{
95c96174 1984 if ((unsigned int)size <= sysctl_optmem_max &&
1da177e4
LT
1985 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1986 void *mem;
1987 /* First do the add, to avoid the race if kmalloc
4ec93edb 1988 * might sleep.
1da177e4
LT
1989 */
1990 atomic_add(size, &sk->sk_omem_alloc);
1991 mem = kmalloc(size, priority);
1992 if (mem)
1993 return mem;
1994 atomic_sub(size, &sk->sk_omem_alloc);
1995 }
1996 return NULL;
1997}
2a91525c 1998EXPORT_SYMBOL(sock_kmalloc);
1da177e4 1999
79e88659
DB
2000/* Free an option memory block. Note, we actually want the inline
2001 * here as this allows gcc to detect the nullify and fold away the
2002 * condition entirely.
1da177e4 2003 */
79e88659
DB
2004static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2005 const bool nullify)
1da177e4 2006{
e53da5fb
DM
2007 if (WARN_ON_ONCE(!mem))
2008 return;
79e88659
DB
2009 if (nullify)
2010 kzfree(mem);
2011 else
2012 kfree(mem);
1da177e4
LT
2013 atomic_sub(size, &sk->sk_omem_alloc);
2014}
79e88659
DB
2015
2016void sock_kfree_s(struct sock *sk, void *mem, int size)
2017{
2018 __sock_kfree_s(sk, mem, size, false);
2019}
2a91525c 2020EXPORT_SYMBOL(sock_kfree_s);
1da177e4 2021
79e88659
DB
2022void sock_kzfree_s(struct sock *sk, void *mem, int size)
2023{
2024 __sock_kfree_s(sk, mem, size, true);
2025}
2026EXPORT_SYMBOL(sock_kzfree_s);
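/*
 * Usage sketch (hypothetical setsockopt helper, not part of this file):
 * option data obtained with sock_kmalloc() is charged to sk_omem_alloc and
 * therefore must be released with sock_kfree_s()/sock_kzfree_s() using the
 * same size, never with plain kfree().  example_set_opt() is an
 * illustrative name only.
 */
static int example_set_opt(struct sock *sk, const void __user *src, int len)
{
	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!opt)
		return -ENOBUFS;
	if (copy_from_user(opt, src, len)) {
		sock_kfree_s(sk, opt, len);
		return -EFAULT;
	}
	/* ... install opt on the socket; sock_kzfree_s() for key material ... */
	sock_kfree_s(sk, opt, len);
	return 0;
}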
2027
1da177e4
LT
2028/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2029 I think these locks should be removed for datagram sockets.
2030 */
2a91525c 2031static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
2032{
2033 DEFINE_WAIT(wait);
2034
9cd3e072 2035 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
2036 for (;;) {
2037 if (!timeo)
2038 break;
2039 if (signal_pending(current))
2040 break;
2041 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 2042 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
14afee4b 2043 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1da177e4
LT
2044 break;
2045 if (sk->sk_shutdown & SEND_SHUTDOWN)
2046 break;
2047 if (sk->sk_err)
2048 break;
2049 timeo = schedule_timeout(timeo);
2050 }
aa395145 2051 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2052 return timeo;
2053}
2054
2055
2056/*
2057 * Generic send/receive buffer handlers
2058 */
2059
4cc7f68d
HX
2060struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2061 unsigned long data_len, int noblock,
28d64271 2062 int *errcode, int max_page_order)
1da177e4 2063{
2e4e4410 2064 struct sk_buff *skb;
1da177e4
LT
2065 long timeo;
2066 int err;
2067
1da177e4 2068 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 2069 for (;;) {
1da177e4
LT
2070 err = sock_error(sk);
2071 if (err != 0)
2072 goto failure;
2073
2074 err = -EPIPE;
2075 if (sk->sk_shutdown & SEND_SHUTDOWN)
2076 goto failure;
2077
2e4e4410
ED
2078 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2079 break;
28d64271 2080
9cd3e072 2081 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
2082 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2083 err = -EAGAIN;
2084 if (!timeo)
1da177e4 2085 goto failure;
2e4e4410
ED
2086 if (signal_pending(current))
2087 goto interrupted;
2088 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 2089 }
2e4e4410
ED
2090 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2091 errcode, sk->sk_allocation);
2092 if (skb)
2093 skb_set_owner_w(skb, sk);
1da177e4
LT
2094 return skb;
2095
2096interrupted:
2097 err = sock_intr_errno(timeo);
2098failure:
2099 *errcode = err;
2100 return NULL;
2101}
4cc7f68d 2102EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 2103
4ec93edb 2104struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
2105 int noblock, int *errcode)
2106{
28d64271 2107 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1da177e4 2108}
2a91525c 2109EXPORT_SYMBOL(sock_alloc_send_skb);
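/*
 * Usage sketch (hypothetical datagram sendmsg path, not part of this
 * file): block for send buffer space unless MSG_DONTWAIT was given, then
 * copy the payload in.  Errors such as -EAGAIN or -EPIPE come back through
 * the errcode pointer.  example_sendmsg() is an illustrative name only.
 */
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int err;
	struct sk_buff *skb = sock_alloc_send_skb(sk, len,
						  msg->msg_flags & MSG_DONTWAIT,
						  &err);

	if (!skb)
		return err;
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* ... hand the skb to the transmit path ... */
	kfree_skb(skb);
	return len;
}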
1da177e4 2110
39771b12
WB
2111int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2112 struct sockcm_cookie *sockc)
2113{
3dd17e63
SHY
2114 u32 tsflags;
2115
39771b12
WB
2116 switch (cmsg->cmsg_type) {
2117 case SO_MARK:
2118 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2119 return -EPERM;
2120 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2121 return -EINVAL;
2122 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2123 break;
3dd17e63
SHY
2124 case SO_TIMESTAMPING:
2125 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2126 return -EINVAL;
2127
2128 tsflags = *(u32 *)CMSG_DATA(cmsg);
2129 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2130 return -EINVAL;
2131
2132 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2133 sockc->tsflags |= tsflags;
2134 break;
80b14dee
RC
2135 case SCM_TXTIME:
2136 if (!sock_flag(sk, SOCK_TXTIME))
2137 return -EINVAL;
2138 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2139 return -EINVAL;
2140 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2141 break;
779f1ede
SHY
2142 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2143 case SCM_RIGHTS:
2144 case SCM_CREDENTIALS:
2145 break;
39771b12
WB
2146 default:
2147 return -EINVAL;
2148 }
2149 return 0;
2150}
2151EXPORT_SYMBOL(__sock_cmsg_send);
2152
f28ea365
EJ
2153int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2154 struct sockcm_cookie *sockc)
2155{
2156 struct cmsghdr *cmsg;
39771b12 2157 int ret;
f28ea365
EJ
2158
2159 for_each_cmsghdr(cmsg, msg) {
2160 if (!CMSG_OK(msg, cmsg))
2161 return -EINVAL;
2162 if (cmsg->cmsg_level != SOL_SOCKET)
2163 continue;
39771b12
WB
2164 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2165 if (ret)
2166 return ret;
f28ea365
EJ
2167 }
2168 return 0;
2169}
2170EXPORT_SYMBOL(sock_cmsg_send);
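/*
 * Usage sketch (hypothetical sendmsg handler, not part of this file):
 * collect the SOL_SOCKET control messages handled above (SO_MARK,
 * SO_TIMESTAMPING, SCM_TXTIME) into a sockcm_cookie before building
 * packets, so per-call values override the socket defaults.
 * example_parse_cmsgs() is an illustrative name only.
 */
static int example_parse_cmsgs(struct sock *sk, struct msghdr *msg)
{
	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };

	if (msg->msg_controllen) {
		int err = sock_cmsg_send(sk, msg, &sockc);

		if (err)
			return err;
	}
	/* ... use sockc.mark, sockc.tsflags, sockc.transmit_time ... */
	return 0;
}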
2171
06044751
ED
2172static void sk_enter_memory_pressure(struct sock *sk)
2173{
2174 if (!sk->sk_prot->enter_memory_pressure)
2175 return;
2176
2177 sk->sk_prot->enter_memory_pressure(sk);
2178}
2179
2180static void sk_leave_memory_pressure(struct sock *sk)
2181{
2182 if (sk->sk_prot->leave_memory_pressure) {
2183 sk->sk_prot->leave_memory_pressure(sk);
2184 } else {
2185 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2186
2187 if (memory_pressure && *memory_pressure)
2188 *memory_pressure = 0;
2189 }
2190}
2191
5640f768
ED
2192/* On 32bit arches, an skb frag is limited to 2^15 */
2193#define SKB_FRAG_PAGE_ORDER get_order(32768)
2194
400dfd3a
ED
2195/**
2196 * skb_page_frag_refill - check that a page_frag contains enough room
2197 * @sz: minimum size of the fragment we want to get
2198 * @pfrag: pointer to page_frag
82d5e2b8 2199 * @gfp: priority for memory allocation
400dfd3a
ED
2200 *
2201 * Note: While this allocator tries to use high order pages, there is
2202 * no guarantee that allocations succeed. Therefore, @sz MUST be
2203 * less than or equal to PAGE_SIZE.
2204 */
d9b2938a 2205bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 2206{
5640f768 2207 if (pfrag->page) {
fe896d18 2208 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
2209 pfrag->offset = 0;
2210 return true;
2211 }
400dfd3a 2212 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
2213 return true;
2214 put_page(pfrag->page);
2215 }
2216
d9b2938a
ED
2217 pfrag->offset = 0;
2218 if (SKB_FRAG_PAGE_ORDER) {
d0164adc
MG
2219 /* Avoid direct reclaim but allow kswapd to wake */
2220 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2221 __GFP_COMP | __GFP_NOWARN |
2222 __GFP_NORETRY,
d9b2938a 2223 SKB_FRAG_PAGE_ORDER);
5640f768 2224 if (likely(pfrag->page)) {
d9b2938a 2225 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
2226 return true;
2227 }
d9b2938a
ED
2228 }
2229 pfrag->page = alloc_page(gfp);
2230 if (likely(pfrag->page)) {
2231 pfrag->size = PAGE_SIZE;
2232 return true;
2233 }
400dfd3a
ED
2234 return false;
2235}
2236EXPORT_SYMBOL(skb_page_frag_refill);
2237
2238bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2239{
2240 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2241 return true;
2242
5640f768
ED
2243 sk_enter_memory_pressure(sk);
2244 sk_stream_moderate_sndbuf(sk);
2245 return false;
2246}
2247EXPORT_SYMBOL(sk_page_frag_refill);
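/*
 * Usage sketch (hypothetical stream sendmsg path, not part of this file):
 * make sure the per-socket (or per-task) page_frag has room, copy payload
 * into it and attach the page to the skb.  When refill fails the socket
 * has already been pushed into memory pressure and its sndbuf moderated.
 * example_copy_to_frag() is an illustrative name only.
 */
static int example_copy_to_frag(struct sock *sk, struct msghdr *msg, int copy,
				struct sk_buff *skb)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy,
				&msg->msg_iter) != copy)
		return -EFAULT;
	get_page(pfrag->page);
	skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
			   pfrag->page, pfrag->offset, copy);
	pfrag->offset += copy;
	return copy;
}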
2248
1da177e4 2249static void __lock_sock(struct sock *sk)
f39234d6
NK
2250 __releases(&sk->sk_lock.slock)
2251 __acquires(&sk->sk_lock.slock)
1da177e4
LT
2252{
2253 DEFINE_WAIT(wait);
2254
e71a4783 2255 for (;;) {
1da177e4
LT
2256 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2257 TASK_UNINTERRUPTIBLE);
2258 spin_unlock_bh(&sk->sk_lock.slock);
2259 schedule();
2260 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 2261 if (!sock_owned_by_user(sk))
1da177e4
LT
2262 break;
2263 }
2264 finish_wait(&sk->sk_lock.wq, &wait);
2265}
2266
8873c064 2267void __release_sock(struct sock *sk)
f39234d6
NK
2268 __releases(&sk->sk_lock.slock)
2269 __acquires(&sk->sk_lock.slock)
1da177e4 2270{
5413d1ba 2271 struct sk_buff *skb, *next;
1da177e4 2272
5413d1ba 2273 while ((skb = sk->sk_backlog.head) != NULL) {
1da177e4 2274 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1da177e4 2275
5413d1ba 2276 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 2277
5413d1ba
ED
2278 do {
2279 next = skb->next;
e4cbb02a 2280 prefetch(next);
7fee226a 2281 WARN_ON_ONCE(skb_dst_is_noref(skb));
a8305bff 2282 skb_mark_not_on_list(skb);
c57943a1 2283 sk_backlog_rcv(sk, skb);
1da177e4 2284
5413d1ba 2285 cond_resched();
1da177e4
LT
2286
2287 skb = next;
2288 } while (skb != NULL);
2289
5413d1ba
ED
2290 spin_lock_bh(&sk->sk_lock.slock);
2291 }
8eae939f
ZY
2292
2293 /*
2294 * Doing the zeroing here guarantees we cannot loop forever
2295 * while a wild producer attempts to flood us.
2296 */
2297 sk->sk_backlog.len = 0;
1da177e4
LT
2298}
2299
d41a69f1
ED
2300void __sk_flush_backlog(struct sock *sk)
2301{
2302 spin_lock_bh(&sk->sk_lock.slock);
2303 __release_sock(sk);
2304 spin_unlock_bh(&sk->sk_lock.slock);
2305}
2306
1da177e4
LT
2307/**
2308 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
2309 * @sk: sock to wait on
2310 * @timeo: for how long
dfbafc99 2311 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
2312 *
2313 * Now socket state including sk->sk_err is changed only under lock,
2314 * hence we may omit checks after joining the wait queue.
2315 * We check the receive queue before schedule() only as an optimization;
2316 * it is very likely that release_sock() added new data.
2317 */
dfbafc99 2318int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4 2319{
d9dc8b0f 2320 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1da177e4 2321 int rc;
1da177e4 2322
d9dc8b0f 2323 add_wait_queue(sk_sleep(sk), &wait);
9cd3e072 2324 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2325 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
9cd3e072 2326 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2327 remove_wait_queue(sk_sleep(sk), &wait);
1da177e4
LT
2328 return rc;
2329}
1da177e4
LT
2330EXPORT_SYMBOL(sk_wait_data);
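/*
 * Usage sketch (hypothetical recvmsg path, not part of this file): the
 * caller holds the socket lock; sk_wait_data() drops it while sleeping and
 * re-takes it before returning, waking up when the tail of the receive
 * queue changes.  example_wait_for_skb() is an illustrative name only.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int noblock,
					    int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}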
2331
3ab224be 2332/**
f8c3bf00 2333 * __sk_mem_raise_allocated - increase memory_allocated
3ab224be
HA
2334 * @sk: socket
2335 * @size: memory size to allocate
f8c3bf00 2336 * @amt: pages to allocate
3ab224be
HA
2337 * @kind: allocation type
2338 *
f8c3bf00 2339 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3ab224be 2340 */
f8c3bf00 2341int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3ab224be
HA
2342{
2343 struct proto *prot = sk->sk_prot;
f8c3bf00 2344 long allocated = sk_memory_allocated_add(sk, amt);
d6f19938 2345 bool charged = true;
e805605c 2346
baac50bb 2347 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
d6f19938 2348 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
e805605c 2349 goto suppress_allocation;
3ab224be
HA
2350
2351 /* Under limit. */
e805605c 2352 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 2353 sk_leave_memory_pressure(sk);
3ab224be
HA
2354 return 1;
2355 }
2356
e805605c
JW
2357 /* Under pressure. */
2358 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 2359 sk_enter_memory_pressure(sk);
3ab224be 2360
e805605c
JW
2361 /* Over hard limit. */
2362 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
2363 goto suppress_allocation;
2364
2365 /* guarantee minimum buffer size under pressure */
2366 if (kind == SK_MEM_RECV) {
a3dcaf17 2367 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3ab224be 2368 return 1;
180d8cd9 2369
3ab224be 2370 } else { /* SK_MEM_SEND */
a3dcaf17
ED
2371 int wmem0 = sk_get_wmem0(sk, prot);
2372
3ab224be 2373 if (sk->sk_type == SOCK_STREAM) {
a3dcaf17 2374 if (sk->sk_wmem_queued < wmem0)
3ab224be 2375 return 1;
a3dcaf17 2376 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3ab224be 2377 return 1;
a3dcaf17 2378 }
3ab224be
HA
2379 }
2380
180d8cd9 2381 if (sk_has_memory_pressure(sk)) {
1748376b
ED
2382 int alloc;
2383
180d8cd9 2384 if (!sk_under_memory_pressure(sk))
1748376b 2385 return 1;
180d8cd9
GC
2386 alloc = sk_sockets_allocated_read_positive(sk);
2387 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
2388 sk_mem_pages(sk->sk_wmem_queued +
2389 atomic_read(&sk->sk_rmem_alloc) +
2390 sk->sk_forward_alloc))
2391 return 1;
2392 }
2393
2394suppress_allocation:
2395
2396 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2397 sk_stream_moderate_sndbuf(sk);
2398
2399 /* Fail only if socket is _under_ its sndbuf.
2400 * In this case we cannot block, so we have to fail.
2401 */
2402 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2403 return 1;
2404 }
2405
d6f19938
YS
2406 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2407 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3847ce32 2408
0e90b31f 2409 sk_memory_allocated_sub(sk, amt);
180d8cd9 2410
baac50bb
JW
2411 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2412 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
e805605c 2413
3ab224be
HA
2414 return 0;
2415}
f8c3bf00
PA
2416EXPORT_SYMBOL(__sk_mem_raise_allocated);
2417
2418/**
2419 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2420 * @sk: socket
2421 * @size: memory size to allocate
2422 * @kind: allocation type
2423 *
2424 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2425 * rmem allocation. This function assumes that protocols which have
2426 * memory_pressure use sk_wmem_queued as write buffer accounting.
2427 */
2428int __sk_mem_schedule(struct sock *sk, int size, int kind)
2429{
2430 int ret, amt = sk_mem_pages(size);
2431
2432 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2433 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2434 if (!ret)
2435 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2436 return ret;
2437}
3ab224be
HA
2438EXPORT_SYMBOL(__sk_mem_schedule);
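/*
 * Usage sketch (hypothetical, not part of this file): charge 'size' bytes
 * of write queue memory before queueing them, in the spirit of the
 * sk_wmem_schedule() helper in net/sock.h.  One quantum is a page here, so
 * e.g. a 3000 byte request on a 4 KiB page system is rounded up to a
 * single quantum by sk_mem_pages().  example_charge_wmem() is an
 * illustrative name only.
 */
static bool example_charge_wmem(struct sock *sk, int size)
{
	if (size <= sk->sk_forward_alloc)
		return true;	/* already covered by the pre-charged pool */
	return __sk_mem_schedule(sk, size, SK_MEM_SEND);
}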
2439
2440/**
f8c3bf00 2441 * __sk_mem_reduce_allocated - reclaim memory_allocated
3ab224be 2442 * @sk: socket
f8c3bf00
PA
2443 * @amount: number of quanta
2444 *
2445 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3ab224be 2446 */
f8c3bf00 2447void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3ab224be 2448{
1a24e04e 2449 sk_memory_allocated_sub(sk, amount);
3ab224be 2450
baac50bb
JW
2451 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2452 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 2453
180d8cd9
GC
2454 if (sk_under_memory_pressure(sk) &&
2455 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2456 sk_leave_memory_pressure(sk);
3ab224be 2457}
f8c3bf00
PA
2458EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2459
2460/**
2461 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2462 * @sk: socket
2463 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2464 */
2465void __sk_mem_reclaim(struct sock *sk, int amount)
2466{
2467 amount >>= SK_MEM_QUANTUM_SHIFT;
2468 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2469 __sk_mem_reduce_allocated(sk, amount);
2470}
3ab224be
HA
2471EXPORT_SYMBOL(__sk_mem_reclaim);
2472
627d2d6b 2473int sk_set_peek_off(struct sock *sk, int val)
2474{
627d2d6b 2475 sk->sk_peek_off = val;
2476 return 0;
2477}
2478EXPORT_SYMBOL_GPL(sk_set_peek_off);
3ab224be 2479
1da177e4
LT
2480/*
2481 * Set of default routines for initialising struct proto_ops when
2482 * the protocol does not support a particular function. In certain
2483 * cases where it makes no sense for a protocol to have a "do nothing"
2484 * function, some default processing is provided.
2485 */
2486
2487int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2488{
2489 return -EOPNOTSUPP;
2490}
2a91525c 2491EXPORT_SYMBOL(sock_no_bind);
1da177e4 2492
4ec93edb 2493int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2494 int len, int flags)
2495{
2496 return -EOPNOTSUPP;
2497}
2a91525c 2498EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
2499
2500int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2501{
2502 return -EOPNOTSUPP;
2503}
2a91525c 2504EXPORT_SYMBOL(sock_no_socketpair);
1da177e4 2505
cdfbabfb
DH
2506int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2507 bool kern)
1da177e4
LT
2508{
2509 return -EOPNOTSUPP;
2510}
2a91525c 2511EXPORT_SYMBOL(sock_no_accept);
1da177e4 2512
4ec93edb 2513int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
9b2c45d4 2514 int peer)
1da177e4
LT
2515{
2516 return -EOPNOTSUPP;
2517}
2a91525c 2518EXPORT_SYMBOL(sock_no_getname);
1da177e4 2519
1da177e4
LT
2520int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2521{
2522 return -EOPNOTSUPP;
2523}
2a91525c 2524EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
2525
2526int sock_no_listen(struct socket *sock, int backlog)
2527{
2528 return -EOPNOTSUPP;
2529}
2a91525c 2530EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
2531
2532int sock_no_shutdown(struct socket *sock, int how)
2533{
2534 return -EOPNOTSUPP;
2535}
2a91525c 2536EXPORT_SYMBOL(sock_no_shutdown);
1da177e4
LT
2537
2538int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 2539 char __user *optval, unsigned int optlen)
1da177e4
LT
2540{
2541 return -EOPNOTSUPP;
2542}
2a91525c 2543EXPORT_SYMBOL(sock_no_setsockopt);
1da177e4
LT
2544
2545int sock_no_getsockopt(struct socket *sock, int level, int optname,
2546 char __user *optval, int __user *optlen)
2547{
2548 return -EOPNOTSUPP;
2549}
2a91525c 2550EXPORT_SYMBOL(sock_no_getsockopt);
1da177e4 2551
1b784140 2552int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
2553{
2554 return -EOPNOTSUPP;
2555}
2a91525c 2556EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 2557
306b13eb
TH
2558int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2559{
2560 return -EOPNOTSUPP;
2561}
2562EXPORT_SYMBOL(sock_no_sendmsg_locked);
2563
1b784140
YX
2564int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2565 int flags)
1da177e4
LT
2566{
2567 return -EOPNOTSUPP;
2568}
2a91525c 2569EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
2570
2571int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2572{
2573 /* Mirror missing mmap method error code */
2574 return -ENODEV;
2575}
2a91525c 2576EXPORT_SYMBOL(sock_no_mmap);
1da177e4
LT
2577
2578ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2579{
2580 ssize_t res;
2581 struct msghdr msg = {.msg_flags = flags};
2582 struct kvec iov;
2583 char *kaddr = kmap(page);
2584 iov.iov_base = kaddr + offset;
2585 iov.iov_len = size;
2586 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2587 kunmap(page);
2588 return res;
2589}
2a91525c 2590EXPORT_SYMBOL(sock_no_sendpage);
1da177e4 2591
306b13eb
TH
2592ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2593 int offset, size_t size, int flags)
2594{
2595 ssize_t res;
2596 struct msghdr msg = {.msg_flags = flags};
2597 struct kvec iov;
2598 char *kaddr = kmap(page);
2599
2600 iov.iov_base = kaddr + offset;
2601 iov.iov_len = size;
2602 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2603 kunmap(page);
2604 return res;
2605}
2606EXPORT_SYMBOL(sock_no_sendpage_locked);
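/*
 * Usage sketch (hypothetical address family, not part of this file):
 * operations a protocol does not support are pointed at the sock_no_*()
 * stubs above rather than being left NULL.  example_dgram_ops and
 * PF_UNSPEC are placeholders only.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
	/* ... release/bind/connect/sendmsg/recvmsg supplied by the protocol ... */
};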
2607
1da177e4
LT
2608/*
2609 * Default Socket Callbacks
2610 */
2611
2612static void sock_def_wakeup(struct sock *sk)
2613{
43815482
ED
2614 struct socket_wq *wq;
2615
2616 rcu_read_lock();
2617 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2618 if (skwq_has_sleeper(wq))
43815482
ED
2619 wake_up_interruptible_all(&wq->wait);
2620 rcu_read_unlock();
1da177e4
LT
2621}
2622
2623static void sock_def_error_report(struct sock *sk)
2624{
43815482
ED
2625 struct socket_wq *wq;
2626
2627 rcu_read_lock();
2628 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2629 if (skwq_has_sleeper(wq))
a9a08845 2630 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
8d8ad9d7 2631 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2632 rcu_read_unlock();
1da177e4
LT
2633}
2634
676d2369 2635static void sock_def_readable(struct sock *sk)
1da177e4 2636{
43815482
ED
2637 struct socket_wq *wq;
2638
2639 rcu_read_lock();
2640 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2641 if (skwq_has_sleeper(wq))
a9a08845
LT
2642 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2643 EPOLLRDNORM | EPOLLRDBAND);
8d8ad9d7 2644 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2645 rcu_read_unlock();
1da177e4
LT
2646}
2647
2648static void sock_def_write_space(struct sock *sk)
2649{
43815482
ED
2650 struct socket_wq *wq;
2651
2652 rcu_read_lock();
1da177e4
LT
2653
2654 /* Do not wake up a writer until he can make "significant"
2655 * progress. --DaveM
2656 */
14afee4b 2657 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2658 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2659 if (skwq_has_sleeper(wq))
a9a08845
LT
2660 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2661 EPOLLWRNORM | EPOLLWRBAND);
1da177e4
LT
2662
2663 /* Should agree with poll, otherwise some programs break */
2664 if (sock_writeable(sk))
8d8ad9d7 2665 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2666 }
2667
43815482 2668 rcu_read_unlock();
1da177e4
LT
2669}
2670
2671static void sock_def_destruct(struct sock *sk)
2672{
1da177e4
LT
2673}
2674
2675void sk_send_sigurg(struct sock *sk)
2676{
2677 if (sk->sk_socket && sk->sk_socket->file)
2678 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2679 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2680}
2a91525c 2681EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
2682
2683void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2684 unsigned long expires)
2685{
2686 if (!mod_timer(timer, expires))
2687 sock_hold(sk);
2688}
1da177e4
LT
2689EXPORT_SYMBOL(sk_reset_timer);
2690
2691void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2692{
25cc4ae9 2693 if (del_timer(timer))
1da177e4
LT
2694 __sock_put(sk);
2695}
1da177e4
LT
2696EXPORT_SYMBOL(sk_stop_timer);
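/*
 * Usage sketch (hypothetical protocol timer, not part of this file):
 * sk_reset_timer() takes a reference on the socket when it arms the timer,
 * and either the expiry handler (via sock_put()) or sk_stop_timer() must
 * drop it again.  Assumes the handler was installed earlier with
 * timer_setup(&sk->sk_timer, example_timer_handler, 0); example_* are
 * illustrative names only.
 */
static void example_timer_handler(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	bh_lock_sock(sk);
	/* ... protocol timeout processing ... */
	bh_unlock_sock(sk);
	sock_put(sk);	/* pairs with the hold taken by sk_reset_timer() */
}

static void example_arm_timer(struct sock *sk, unsigned long delay)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}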
2697
2698void sock_init_data(struct socket *sock, struct sock *sk)
2699{
581319c5 2700 sk_init_common(sk);
1da177e4
LT
2701 sk->sk_send_head = NULL;
2702
99767f27 2703 timer_setup(&sk->sk_timer, NULL, 0);
4ec93edb 2704
1da177e4
LT
2705 sk->sk_allocation = GFP_KERNEL;
2706 sk->sk_rcvbuf = sysctl_rmem_default;
2707 sk->sk_sndbuf = sysctl_wmem_default;
2708 sk->sk_state = TCP_CLOSE;
972692e0 2709 sk_set_socket(sk, sock);
1da177e4
LT
2710
2711 sock_set_flag(sk, SOCK_ZAPPED);
2712
e71a4783 2713 if (sock) {
1da177e4 2714 sk->sk_type = sock->type;
43815482 2715 sk->sk_wq = sock->wq;
1da177e4 2716 sock->sk = sk;
86741ec2
LC
2717 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2718 } else {
43815482 2719 sk->sk_wq = NULL;
86741ec2
LC
2720 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2721 }
1da177e4 2722
1da177e4 2723 rwlock_init(&sk->sk_callback_lock);
cdfbabfb
DH
2724 if (sk->sk_kern_sock)
2725 lockdep_set_class_and_name(
2726 &sk->sk_callback_lock,
2727 af_kern_callback_keys + sk->sk_family,
2728 af_family_kern_clock_key_strings[sk->sk_family]);
2729 else
2730 lockdep_set_class_and_name(
2731 &sk->sk_callback_lock,
443aef0e
PZ
2732 af_callback_keys + sk->sk_family,
2733 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
2734
2735 sk->sk_state_change = sock_def_wakeup;
2736 sk->sk_data_ready = sock_def_readable;
2737 sk->sk_write_space = sock_def_write_space;
2738 sk->sk_error_report = sock_def_error_report;
2739 sk->sk_destruct = sock_def_destruct;
2740
5640f768
ED
2741 sk->sk_frag.page = NULL;
2742 sk->sk_frag.offset = 0;
ef64a54f 2743 sk->sk_peek_off = -1;
1da177e4 2744
109f6e39
EB
2745 sk->sk_peer_pid = NULL;
2746 sk->sk_peer_cred = NULL;
1da177e4
LT
2747 sk->sk_write_pending = 0;
2748 sk->sk_rcvlowat = 1;
2749 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2750 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2751
6c7c98ba 2752 sk->sk_stamp = SK_DEFAULT_STAMP;
52267790 2753 atomic_set(&sk->sk_zckey, 0);
1da177e4 2754
e0d1095a 2755#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 2756 sk->sk_napi_id = 0;
64b0dc51 2757 sk->sk_ll_usec = sysctl_net_busy_read;
06021292
ET
2758#endif
2759
76a9ebe8
ED
2760 sk->sk_max_pacing_rate = ~0UL;
2761 sk->sk_pacing_rate = ~0UL;
3a9b76fd 2762 sk->sk_pacing_shift = 10;
70da268b 2763 sk->sk_incoming_cpu = -1;
c6345ce7
AN
2764
2765 sk_rx_queue_clear(sk);
4dc6dc71
ED
2766 /*
2767 * Before updating sk_refcnt, we must commit prior changes to memory
2768 * (Documentation/RCU/rculist_nulls.txt for details)
2769 */
2770 smp_wmb();
41c6d650 2771 refcount_set(&sk->sk_refcnt, 1);
33c732c3 2772 atomic_set(&sk->sk_drops, 0);
1da177e4 2773}
2a91525c 2774EXPORT_SYMBOL(sock_init_data);
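/*
 * Usage sketch (hypothetical family ->create() hook, not part of this
 * file): allocate the sock with sk_alloc(), then let sock_init_data()
 * attach it to the struct socket and install the default callbacks and
 * buffer sizes set up above.  example_proto, example_create and PF_UNSPEC
 * are placeholders only.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),	/* usually a larger protocol sock */
};

static int example_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct sock *sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL,
				   &example_proto, kern);

	if (!sk)
		return -ENOBUFS;
	sock_init_data(sock, sk);
	sk->sk_protocol = protocol;
	return 0;
}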
1da177e4 2775
b5606c2d 2776void lock_sock_nested(struct sock *sk, int subclass)
1da177e4
LT
2777{
2778 might_sleep();
a5b5bb9a 2779 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2780 if (sk->sk_lock.owned)
1da177e4 2781 __lock_sock(sk);
d2e9117c 2782 sk->sk_lock.owned = 1;
a5b5bb9a
IM
2783 spin_unlock(&sk->sk_lock.slock);
2784 /*
2785 * The sk_lock has mutex_lock() semantics here:
2786 */
fcc70d5f 2787 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2788 local_bh_enable();
1da177e4 2789}
fcc70d5f 2790EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2791
b5606c2d 2792void release_sock(struct sock *sk)
1da177e4 2793{
a5b5bb9a 2794 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
2795 if (sk->sk_backlog.tail)
2796 __release_sock(sk);
46d3ceab 2797
c3f9b018
ED
2798 /* Warning : release_cb() might need to release sk ownership,
2799 * ie call sock_release_ownership(sk) before us.
2800 */
46d3ceab
ED
2801 if (sk->sk_prot->release_cb)
2802 sk->sk_prot->release_cb(sk);
2803
c3f9b018 2804 sock_release_ownership(sk);
a5b5bb9a
IM
2805 if (waitqueue_active(&sk->sk_lock.wq))
2806 wake_up(&sk->sk_lock.wq);
2807 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
2808}
2809EXPORT_SYMBOL(release_sock);
2810
8a74ad60
ED
2811/**
2812 * lock_sock_fast - fast version of lock_sock
2813 * @sk: socket
2814 *
2815 * This version should be used for very small sections, where the process won't block
d651983d
MCC
2816 * return false if fast path is taken:
2817 *
8a74ad60 2818 * sk_lock.slock locked, owned = 0, BH disabled
d651983d
MCC
2819 *
2820 * return true if slow path is taken:
2821 *
8a74ad60
ED
2822 * sk_lock.slock unlocked, owned = 1, BH enabled
2823 */
2824bool lock_sock_fast(struct sock *sk)
2825{
2826 might_sleep();
2827 spin_lock_bh(&sk->sk_lock.slock);
2828
2829 if (!sk->sk_lock.owned)
2830 /*
2831 * Note : We must disable BH
2832 */
2833 return false;
2834
2835 __lock_sock(sk);
2836 sk->sk_lock.owned = 1;
2837 spin_unlock(&sk->sk_lock.slock);
2838 /*
2839 * The sk_lock has mutex_lock() semantics here:
2840 */
2841 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2842 local_bh_enable();
2843 return true;
2844}
2845EXPORT_SYMBOL(lock_sock_fast);
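/*
 * Usage sketch (hypothetical, not part of this file): lock_sock_fast()
 * stays on the spinlock for short critical sections when no process owns
 * the socket, and the boolean it returns must be handed back to
 * unlock_sock_fast().  example_read_fwd_alloc() is an illustrative name
 * only.
 */
static int example_read_fwd_alloc(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int val = sk->sk_forward_alloc;	/* state normally updated under the lock */

	unlock_sock_fast(sk, slow);
	return val;
}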
2846
1da177e4 2847int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
4ec93edb 2848{
b7aa0bf7 2849 struct timeval tv;
9dae3497
YS
2850
2851 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
b7aa0bf7
ED
2852 tv = ktime_to_timeval(sk->sk_stamp);
2853 if (tv.tv_sec == -1)
1da177e4 2854 return -ENOENT;
b7aa0bf7
ED
2855 if (tv.tv_sec == 0) {
2856 sk->sk_stamp = ktime_get_real();
2857 tv = ktime_to_timeval(sk->sk_stamp);
2858 }
2859 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
4ec93edb 2860}
1da177e4
LT
2861EXPORT_SYMBOL(sock_get_timestamp);
2862
ae40eb1e
ED
2863int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2864{
2865 struct timespec ts;
9dae3497
YS
2866
2867 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
ae40eb1e
ED
2868 ts = ktime_to_timespec(sk->sk_stamp);
2869 if (ts.tv_sec == -1)
2870 return -ENOENT;
2871 if (ts.tv_sec == 0) {
2872 sk->sk_stamp = ktime_get_real();
2873 ts = ktime_to_timespec(sk->sk_stamp);
2874 }
2875 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2876}
2877EXPORT_SYMBOL(sock_get_timestampns);
2878
20d49473 2879void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 2880{
20d49473 2881 if (!sock_flag(sk, flag)) {
08e29af3
ED
2882 unsigned long previous_flags = sk->sk_flags;
2883
20d49473
PO
2884 sock_set_flag(sk, flag);
2885 /*
2886 * we just set one of the two flags which require net
2887 * time stamping, but time stamping might have been on
2888 * already because of the other one
2889 */
080a270f
HFS
2890 if (sock_needs_netstamp(sk) &&
2891 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 2892 net_enable_timestamp();
1da177e4
LT
2893 }
2894}
1da177e4 2895
cb820f8e
RC
2896int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2897 int level, int type)
2898{
2899 struct sock_exterr_skb *serr;
364a9e93 2900 struct sk_buff *skb;
cb820f8e
RC
2901 int copied, err;
2902
2903 err = -EAGAIN;
364a9e93 2904 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
2905 if (skb == NULL)
2906 goto out;
2907
2908 copied = skb->len;
2909 if (copied > len) {
2910 msg->msg_flags |= MSG_TRUNC;
2911 copied = len;
2912 }
51f3d02b 2913 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
2914 if (err)
2915 goto out_free_skb;
2916
2917 sock_recv_timestamp(msg, sk, skb);
2918
2919 serr = SKB_EXT_ERR(skb);
2920 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2921
2922 msg->msg_flags |= MSG_ERRQUEUE;
2923 err = copied;
2924
cb820f8e
RC
2925out_free_skb:
2926 kfree_skb(skb);
2927out:
2928 return err;
2929}
2930EXPORT_SYMBOL(sock_recv_errqueue);
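/*
 * Usage sketch (hypothetical recvmsg handler, not part of this file):
 * MSG_ERRQUEUE reads are served from the error queue via
 * sock_recv_errqueue() instead of the normal receive path.  SOL_EXAMPLE,
 * EXAMPLE_RECVERR and example_recvmsg() are placeholders only.
 */
#define SOL_EXAMPLE		255	/* hypothetical cmsg level */
#define EXAMPLE_RECVERR		1	/* hypothetical cmsg type */

static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len,
					  SOL_EXAMPLE, EXAMPLE_RECVERR);
	/* ... normal receive path ... */
	return -EAGAIN;
}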
2931
1da177e4
LT
2932/*
2933 * Get a socket option on a socket.
2934 *
2935 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2936 * asynchronous errors should be reported by getsockopt. We assume
2937 * this means if you specify SO_ERROR (otherwise what's the point of it).
2938 */
2939int sock_common_getsockopt(struct socket *sock, int level, int optname,
2940 char __user *optval, int __user *optlen)
2941{
2942 struct sock *sk = sock->sk;
2943
2944 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2945}
1da177e4
LT
2946EXPORT_SYMBOL(sock_common_getsockopt);
2947
3fdadf7d 2948#ifdef CONFIG_COMPAT
543d9cfe
ACM
2949int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2950 char __user *optval, int __user *optlen)
3fdadf7d
DM
2951{
2952 struct sock *sk = sock->sk;
2953
1e51f951 2954 if (sk->sk_prot->compat_getsockopt != NULL)
543d9cfe
ACM
2955 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2956 optval, optlen);
3fdadf7d
DM
2957 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2958}
2959EXPORT_SYMBOL(compat_sock_common_getsockopt);
2960#endif
2961
1b784140
YX
2962int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2963 int flags)
1da177e4
LT
2964{
2965 struct sock *sk = sock->sk;
2966 int addr_len = 0;
2967 int err;
2968
1b784140 2969 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
1da177e4
LT
2970 flags & ~MSG_DONTWAIT, &addr_len);
2971 if (err >= 0)
2972 msg->msg_namelen = addr_len;
2973 return err;
2974}
1da177e4
LT
2975EXPORT_SYMBOL(sock_common_recvmsg);
2976
2977/*
2978 * Set socket options on an inet socket.
2979 */
2980int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2981 char __user *optval, unsigned int optlen)
1da177e4
LT
2982{
2983 struct sock *sk = sock->sk;
2984
2985 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2986}
1da177e4
LT
2987EXPORT_SYMBOL(sock_common_setsockopt);
2988
3fdadf7d 2989#ifdef CONFIG_COMPAT
543d9cfe 2990int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2991 char __user *optval, unsigned int optlen)
3fdadf7d
DM
2992{
2993 struct sock *sk = sock->sk;
2994
543d9cfe
ACM
2995 if (sk->sk_prot->compat_setsockopt != NULL)
2996 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2997 optval, optlen);
3fdadf7d
DM
2998 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2999}
3000EXPORT_SYMBOL(compat_sock_common_setsockopt);
3001#endif
3002
1da177e4
LT
3003void sk_common_release(struct sock *sk)
3004{
3005 if (sk->sk_prot->destroy)
3006 sk->sk_prot->destroy(sk);
3007
3008 /*
3009 * Observation: when sock_common_release is called, processes have
3010 * no access to the socket. But the net still has.
3011 * Step one, detach it from networking:
3012 *
3013 * A. Remove from hash tables.
3014 */
3015
3016 sk->sk_prot->unhash(sk);
3017
3018 /*
3019 * At this point the socket cannot receive new packets, but it is possible
3020 * that some packets are in flight because some CPU runs the receiver and
3021 * did a hash table lookup before we unhashed the socket. They will reach
3022 * the receive queue and will be purged by the socket destructor.
3023 *
3024 * Also we still have packets pending on the receive queue and probably
3025 * our own packets waiting in device queues. sock_destroy will drain the
3026 * receive queue, but transmitted packets will delay socket destruction
3027 * until the last reference is released.
3028 */
3029
3030 sock_orphan(sk);
3031
3032 xfrm_sk_free_policy(sk);
3033
e6848976 3034 sk_refcnt_debug_release(sk);
5640f768 3035
1da177e4
LT
3036 sock_put(sk);
3037}
1da177e4
LT
3038EXPORT_SYMBOL(sk_common_release);
3039
a2d133b1
JH
3040void sk_get_meminfo(const struct sock *sk, u32 *mem)
3041{
3042 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3043
3044 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3045 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3046 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3047 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3048 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3049 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3050 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3051 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3052 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3053}
3054
13ff3d6f
PE
3055#ifdef CONFIG_PROC_FS
3056#define PROTO_INUSE_NR 64 /* should be enough for the first time */
1338d466
PE
3057struct prot_inuse {
3058 int val[PROTO_INUSE_NR];
3059};
13ff3d6f
PE
3060
3061static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159 3062
70ee1159
PE
3063void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3064{
08fc7f81 3065 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
70ee1159
PE
3066}
3067EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3068
3069int sock_prot_inuse_get(struct net *net, struct proto *prot)
3070{
3071 int cpu, idx = prot->inuse_idx;
3072 int res = 0;
3073
3074 for_each_possible_cpu(cpu)
08fc7f81 3075 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
70ee1159
PE
3076
3077 return res >= 0 ? res : 0;
3078}
3079EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3080
648845ab
TZ
3081static void sock_inuse_add(struct net *net, int val)
3082{
3083 this_cpu_add(*net->core.sock_inuse, val);
3084}
3085
3086int sock_inuse_get(struct net *net)
3087{
3088 int cpu, res = 0;
3089
3090 for_each_possible_cpu(cpu)
3091 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3092
3093 return res;
3094}
3095
3096EXPORT_SYMBOL_GPL(sock_inuse_get);
3097
2c8c1e72 3098static int __net_init sock_inuse_init_net(struct net *net)
70ee1159 3099{
08fc7f81 3100 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
648845ab
TZ
3101 if (net->core.prot_inuse == NULL)
3102 return -ENOMEM;
3103
3104 net->core.sock_inuse = alloc_percpu(int);
3105 if (net->core.sock_inuse == NULL)
3106 goto out;
3107
3108 return 0;
3109
3110out:
3111 free_percpu(net->core.prot_inuse);
3112 return -ENOMEM;
70ee1159
PE
3113}
3114
2c8c1e72 3115static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159 3116{
08fc7f81 3117 free_percpu(net->core.prot_inuse);
648845ab 3118 free_percpu(net->core.sock_inuse);
70ee1159
PE
3119}
3120
3121static struct pernet_operations net_inuse_ops = {
3122 .init = sock_inuse_init_net,
3123 .exit = sock_inuse_exit_net,
3124};
3125
3126static __init int net_inuse_init(void)
3127{
3128 if (register_pernet_subsys(&net_inuse_ops))
3129 panic("Cannot initialize net inuse counters");
3130
3131 return 0;
3132}
3133
3134core_initcall(net_inuse_init);
13ff3d6f
PE
3135
3136static void assign_proto_idx(struct proto *prot)
3137{
3138 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3139
3140 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 3141 pr_err("PROTO_INUSE_NR exhausted\n");
13ff3d6f
PE
3142 return;
3143 }
3144
3145 set_bit(prot->inuse_idx, proto_inuse_idx);
3146}
3147
3148static void release_proto_idx(struct proto *prot)
3149{
3150 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3151 clear_bit(prot->inuse_idx, proto_inuse_idx);
3152}
3153#else
3154static inline void assign_proto_idx(struct proto *prot)
3155{
3156}
3157
3158static inline void release_proto_idx(struct proto *prot)
3159{
3160}
648845ab
TZ
3161
3162static void sock_inuse_add(struct net *net, int val)
3163{
3164}
13ff3d6f
PE
3165#endif
3166
0159dfd3
ED
3167static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3168{
3169 if (!rsk_prot)
3170 return;
3171 kfree(rsk_prot->slab_name);
3172 rsk_prot->slab_name = NULL;
adf78eda
JL
3173 kmem_cache_destroy(rsk_prot->slab);
3174 rsk_prot->slab = NULL;
0159dfd3
ED
3175}
3176
3177static int req_prot_init(const struct proto *prot)
3178{
3179 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3180
3181 if (!rsk_prot)
3182 return 0;
3183
3184 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3185 prot->name);
3186 if (!rsk_prot->slab_name)
3187 return -ENOMEM;
3188
3189 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3190 rsk_prot->obj_size, 0,
e699e2c6
SB
3191 SLAB_ACCOUNT | prot->slab_flags,
3192 NULL);
0159dfd3
ED
3193
3194 if (!rsk_prot->slab) {
3195 pr_crit("%s: Can't create request sock SLAB cache!\n",
3196 prot->name);
3197 return -ENOMEM;
3198 }
3199 return 0;
3200}
3201
b733c007
PE
3202int proto_register(struct proto *prot, int alloc_slab)
3203{
1da177e4 3204 if (alloc_slab) {
30c2c9f1
DW
3205 prot->slab = kmem_cache_create_usercopy(prot->name,
3206 prot->obj_size, 0,
e699e2c6
SB
3207 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3208 prot->slab_flags,
289a4860 3209 prot->useroffset, prot->usersize,
271b72c7 3210 NULL);
1da177e4
LT
3211
3212 if (prot->slab == NULL) {
e005d193
JP
3213 pr_crit("%s: Can't create sock SLAB cache!\n",
3214 prot->name);
60e7663d 3215 goto out;
1da177e4 3216 }
2e6599cb 3217
0159dfd3
ED
3218 if (req_prot_init(prot))
3219 goto out_free_request_sock_slab;
8feaf0c0 3220
6d6ee43e 3221 if (prot->twsk_prot != NULL) {
faf23422 3222 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 3223
7e56b5d6 3224 if (prot->twsk_prot->twsk_slab_name == NULL)
8feaf0c0
ACM
3225 goto out_free_request_sock_slab;
3226
6d6ee43e 3227 prot->twsk_prot->twsk_slab =
7e56b5d6 3228 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 3229 prot->twsk_prot->twsk_obj_size,
3ab5aee7 3230 0,
e699e2c6 3231 SLAB_ACCOUNT |
52db70dc 3232 prot->slab_flags,
20c2df83 3233 NULL);
6d6ee43e 3234 if (prot->twsk_prot->twsk_slab == NULL)
8feaf0c0
ACM
3235 goto out_free_timewait_sock_slab_name;
3236 }
1da177e4
LT
3237 }
3238
36b77a52 3239 mutex_lock(&proto_list_mutex);
1da177e4 3240 list_add(&prot->node, &proto_list);
13ff3d6f 3241 assign_proto_idx(prot);
36b77a52 3242 mutex_unlock(&proto_list_mutex);
b733c007
PE
3243 return 0;
3244
8feaf0c0 3245out_free_timewait_sock_slab_name:
7e56b5d6 3246 kfree(prot->twsk_prot->twsk_slab_name);
8feaf0c0 3247out_free_request_sock_slab:
0159dfd3
ED
3248 req_prot_cleanup(prot->rsk_prot);
3249
2e6599cb
ACM
3250 kmem_cache_destroy(prot->slab);
3251 prot->slab = NULL;
b733c007
PE
3252out:
3253 return -ENOBUFS;
1da177e4 3254}
1da177e4
LT
3255EXPORT_SYMBOL(proto_register);
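/*
 * Usage sketch (hypothetical protocol module, not part of this file):
 * register the struct proto (creating its slab cache when alloc_slab is
 * non-zero) at module init and unregister it on exit, reusing the
 * hypothetical example_proto from the sock_init_data() sketch above.
 * example_module_* are illustrative names only.
 */
static int __init example_module_init(void)
{
	return proto_register(&example_proto, 1);
}

static void __exit example_module_exit(void)
{
	proto_unregister(&example_proto);
}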
3256
3257void proto_unregister(struct proto *prot)
3258{
36b77a52 3259 mutex_lock(&proto_list_mutex);
13ff3d6f 3260 release_proto_idx(prot);
0a3f4358 3261 list_del(&prot->node);
36b77a52 3262 mutex_unlock(&proto_list_mutex);
1da177e4 3263
adf78eda
JL
3264 kmem_cache_destroy(prot->slab);
3265 prot->slab = NULL;
1da177e4 3266
0159dfd3 3267 req_prot_cleanup(prot->rsk_prot);
2e6599cb 3268
6d6ee43e 3269 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 3270 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 3271 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 3272 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 3273 }
1da177e4 3274}
1da177e4
LT
3275EXPORT_SYMBOL(proto_unregister);
3276
bf2ae2e4
XL
3277int sock_load_diag_module(int family, int protocol)
3278{
3279 if (!protocol) {
3280 if (!sock_is_registered(family))
3281 return -ENOENT;
3282
3283 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3284 NETLINK_SOCK_DIAG, family);
3285 }
3286
3287#ifdef CONFIG_INET
3288 if (family == AF_INET &&
c34c1287 3289 protocol != IPPROTO_RAW &&
bf2ae2e4
XL
3290 !rcu_access_pointer(inet_protos[protocol]))
3291 return -ENOENT;
3292#endif
3293
3294 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3295 NETLINK_SOCK_DIAG, family, protocol);
3296}
3297EXPORT_SYMBOL(sock_load_diag_module);
3298
1da177e4 3299#ifdef CONFIG_PROC_FS
1da177e4 3300static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 3301 __acquires(proto_list_mutex)
1da177e4 3302{
36b77a52 3303 mutex_lock(&proto_list_mutex);
60f0438a 3304 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
3305}
3306
3307static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3308{
60f0438a 3309 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
3310}
3311
3312static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 3313 __releases(proto_list_mutex)
1da177e4 3314{
36b77a52 3315 mutex_unlock(&proto_list_mutex);
1da177e4
LT
3316}
3317
3318static char proto_method_implemented(const void *method)
3319{
3320 return method == NULL ? 'n' : 'y';
3321}
180d8cd9
GC
3322static long sock_prot_memory_allocated(struct proto *proto)
3323{
cb75a36c 3324 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
3325}
3326
3327static char *sock_prot_memory_pressure(struct proto *proto)
3328{
3329 return proto->memory_pressure != NULL ?
3330 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3331}
1da177e4
LT
3332
3333static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3334{
180d8cd9 3335
8d987e5c 3336 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
1da177e4
LT
3337 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3338 proto->name,
3339 proto->obj_size,
14e943db 3340 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
3341 sock_prot_memory_allocated(proto),
3342 sock_prot_memory_pressure(proto),
1da177e4
LT
3343 proto->max_header,
3344 proto->slab == NULL ? "no" : "yes",
3345 module_name(proto->owner),
3346 proto_method_implemented(proto->close),
3347 proto_method_implemented(proto->connect),
3348 proto_method_implemented(proto->disconnect),
3349 proto_method_implemented(proto->accept),
3350 proto_method_implemented(proto->ioctl),
3351 proto_method_implemented(proto->init),
3352 proto_method_implemented(proto->destroy),
3353 proto_method_implemented(proto->shutdown),
3354 proto_method_implemented(proto->setsockopt),
3355 proto_method_implemented(proto->getsockopt),
3356 proto_method_implemented(proto->sendmsg),
3357 proto_method_implemented(proto->recvmsg),
3358 proto_method_implemented(proto->sendpage),
3359 proto_method_implemented(proto->bind),
3360 proto_method_implemented(proto->backlog_rcv),
3361 proto_method_implemented(proto->hash),
3362 proto_method_implemented(proto->unhash),
3363 proto_method_implemented(proto->get_port),
3364 proto_method_implemented(proto->enter_memory_pressure));
3365}
3366
3367static int proto_seq_show(struct seq_file *seq, void *v)
3368{
60f0438a 3369 if (v == &proto_list)
1da177e4
LT
3370 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3371 "protocol",
3372 "size",
3373 "sockets",
3374 "memory",
3375 "press",
3376 "maxhdr",
3377 "slab",
3378 "module",
3379 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3380 else
60f0438a 3381 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
3382 return 0;
3383}
3384
f690808e 3385static const struct seq_operations proto_seq_ops = {
1da177e4
LT
3386 .start = proto_seq_start,
3387 .next = proto_seq_next,
3388 .stop = proto_seq_stop,
3389 .show = proto_seq_show,
3390};
3391
14e943db
ED
3392static __net_init int proto_init_net(struct net *net)
3393{
c3506372
CH
3394 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3395 sizeof(struct seq_net_private)))
14e943db
ED
3396 return -ENOMEM;
3397
3398 return 0;
3399}
3400
3401static __net_exit void proto_exit_net(struct net *net)
3402{
ece31ffd 3403 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
3404}
3405
3406
3407static __net_initdata struct pernet_operations proto_net_ops = {
3408 .init = proto_init_net,
3409 .exit = proto_exit_net,
1da177e4
LT
3410};
3411
3412static int __init proto_init(void)
3413{
14e943db 3414 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
3415}
3416
3417subsys_initcall(proto_init);
3418
3419#endif /* PROC_FS */
7db6b048
SS
3420
3421#ifdef CONFIG_NET_RX_BUSY_POLL
3422bool sk_busy_loop_end(void *p, unsigned long start_time)
3423{
3424 struct sock *sk = p;
3425
3426 return !skb_queue_empty(&sk->sk_receive_queue) ||
3427 sk_busy_loop_timeout(sk, start_time);
3428}
3429EXPORT_SYMBOL(sk_busy_loop_end);
3430#endif /* CONFIG_NET_RX_BUSY_POLL */