// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov	:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski	:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/cacheflush.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#include <net/inet_common.h>
#include <linux/bpf.h>
#include <net/compat.h>
#include <linux/netfilter_netdev.h>
/*
   Assumptions:
   - If the device has no dev->header_ops->create, there is no LL header
     visible above the device. In this case, its hard_header_len should be 0.
     The device may prepend its own header internally. In this case, its
     needed_headroom should be set to the space needed for it to add its
     internal header.
     For example, a WiFi driver pretending to be an Ethernet driver should
     set its hard_header_len to be the Ethernet header length, and set its
     needed_headroom to be (the real WiFi header length - the fake Ethernet
     header length).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> data

Outgoing, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

Incoming, dev_has_header(dev) == false
   mac_header -> data
     However drivers often make it point to the ll header.
     This is incorrect because the ll header should be invisible to us.
   data       -> data

Outgoing, dev_has_header(dev) == false
   mac_header -> data. ll header is invisible to us.
   data       -> data

Resume
  If dev_has_header(dev) == false we are unable to restore the ll header,
    because it is invisible to us.


On transmit:
------------

dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

dev_has_header(dev) == false (ll header is invisible to us)
   mac_header -> data
   data       -> data

   We should set network_header on output to the correct position,
   packet classifier depends on it.
 */
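/* Illustrative userspace sketch (not part of the original source): the
 * contract above is what a receiver observes.  A SOCK_RAW packet socket
 * sees the link-layer header because the rx path pushes it back; a
 * SOCK_DGRAM packet socket starts at the network header:
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	// recv(raw, ...)   -> buf[0] is the first byte of the ll header,
 *	//                     if the device exposes one
 *	// recv(dgram, ...) -> buf[0] is the first byte of the payload
 *	//                     above the ll header
 */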
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);
#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);
struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);
#ifdef CONFIG_NETFILTER_EGRESS
static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
{
	struct sk_buff *next, *head = NULL, *tail;
	int rc;

	rcu_read_lock();
	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb_mark_not_on_list(skb);

		if (!nf_hook_egress(skb, &rc, skb->dev))
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;

		tail = skb;
	}
	rcu_read_unlock();

	return head;
}
#endif
static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb)
{
	if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS))
		return dev_queue_xmit(skb);

#ifdef CONFIG_NETFILTER_EGRESS
	if (nf_hook_egress_active()) {
		skb = nf_hook_direct_egress(skb);
		if (!skb)
			return NET_XMIT_DROP;
	}
#endif
	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}
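/* Hedged usage sketch (not from this file): the PACKET_SOCK_QDISC_BYPASS
 * flag tested above is controlled from userspace with the
 * PACKET_QDISC_BYPASS socket option; once set, TX skips the qdisc layer
 * and goes straight to the driver via dev_direct_xmit():
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 */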
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}
static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	int cpu = raw_smp_processor_id();
	u16 queue_index;

#ifdef CONFIG_XPS
	skb->sender_cpu = cpu + 1;
#endif
	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = netdev_pick_tx(dev, skb, NULL);
	}

	return queue_index;
}
/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1);
	}
}

static void register_prot_hook(struct sock *sk)
{
	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
	__register_prot_hook(sk);
}
/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * callers responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (packet_sock_flag(po, PACKET_SOCK_RUNNING))
		__unregister_prot_hook(sk, sync);
}
static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;

	/* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
	switch (po->tp_version) {
	case TPACKET_V1:
		WRITE_ONCE(h.h1->tp_status, status);
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		WRITE_ONCE(h.h2->tp_status, status);
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		WRITE_ONCE(h.h3->tp_status, status);
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}
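/* Hedged reference sketch (not from this file): the status word written
 * above is the kernel side of the mmap(2) ring handshake.  A userspace
 * RX loop polls the same word from the other side:
 *
 *	struct tpacket2_hdr *hdr = frame;	// frame in the mmap'd ring
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel
 *	// ...consume the frame...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the slot back
 */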
static int __packet_get_status(const struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	/* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return READ_ONCE(h.h1->tp_status);
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return READ_ONCE(h.h2->tp_status);
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return READ_ONCE(h.h3->tp_status);
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
	    ktime_to_timespec64_cond(skb_tstamp(skb), ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}
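/* Hedged usage note (not part of the original): the flags consumed above
 * come from the PACKET_TIMESTAMP socket option, e.g. to request NIC
 * hardware timestamps in the ring:
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *
 * The TP_STATUS_TS_* bits returned here end up in tp_status and tell
 * userspace which clock produced tp_sec/tp_nsec.
 */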
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec64 ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp))))
		return 0;

	h.raw = frame;
	/*
	 * versions 1 through 3 overflow the timestamps in y2106, since they
	 * all store the seconds in a 32-bit unsigned integer.
	 * If we create a version 4, that should have a 64-bit timestamp,
	 * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
	 * nanoseconds.
	 */
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}
static void *packet_lookup_frame(const struct packet_sock *po,
				 const struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}
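/* Worked example of the lookup arithmetic above (illustrative numbers):
 * with frame_size = 2048 and a 16 KiB block, frames_per_block = 8, so
 * position 11 maps to pg_vec_pos = 11 / 8 = 1 (the second block) and
 * frame_offset = 11 % 8 = 3, i.e. byte offset 3 * 2048 = 6144 into that
 * block's buffer.
 */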
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
					  struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				   int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits, div;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (err)
		return DEFAULT_PRB_RETIRE_TOV;

	/* If the link speed is so slow you don't really
	 * need to worry about perf anyways
	 */
	if (ecmd.base.speed < SPEED_1000 ||
	    ecmd.base.speed == SPEED_UNKNOWN)
		return DEFAULT_PRB_RETIRE_TOV;

	div = ecmd.base.speed / 1000;
	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	if (div)
		return mbits + 1;
	return mbits;
}
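/* Worked example (illustrative): for a 1 MiB block on a 1 Gbps link,
 * div = 1000 / 1000 = 1 and mbits = (1048576 * 8) / (1024 * 1024) = 8,
 * so the function returns 8 / 1 + 1 = 9 ms - roughly the ~8 ms fill time
 * noted in the timer-logic comment below, plus a tick of slack.  On a
 * 10 Gbps link the same block retires after 8 / 10 + 1 = 1 ms.
 */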
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}
static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
	rwlock_init(&p1->blk_fill_in_prog_lock);

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}
/*  Do NOT update the last_blk_num first.
 *  Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, lets say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	struct packet_sock *po =
		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		/* Waiting for skb_copy_bits to finish... */
		write_lock(&pkc->blk_fill_in_prog_lock);
		write_unlock(&pkc->blk_fill_in_prog_lock);
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. queue was frozen,user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close.So we open this
				 * block and restart the timer.
				 * opening a block thaws the queue,restarts timer
				 * Thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}
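/* Hedged configuration sketch (not from this file): the timeout driving
 * the handler above is either derived by prb_calc_retire_blk_tmo() or set
 * explicitly when the TPACKET_V3 RX ring is created (after selecting the
 * version with PACKET_VERSION):
 *
 *	struct tpacket_req3 req = {
 *		.tp_block_size = 1 << 20,
 *		.tp_block_nr = 8,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr = ((1 << 20) / 2048) * 8,
 *		.tp_retire_blk_tov = 60,	// retire blocks after 60 ms
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */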
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header(we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}
/*
 * Side effect of closing a block:
 *
 * 2) Increment active_blk_num
 *
 * Note:We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (atomic_read(&po->tp_drops))
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec64 ts;
		ktime_get_real_ts64(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}
static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}
/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec64 ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	ktime_get_real_ts64(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7,loop around and try to fill block '0'.
 *    and __packet_lookup_frame_in_block
 *		prb_retire_current_block()
 *		prb_dispatch_next_block()
 *		  |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires,it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}
#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			/* Waiting for skb_copy_bits to finish... */
			write_lock(&pkc->blk_fill_in_prog_lock);
			write_unlock(&pkc->blk_fill_in_prog_lock);
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}
static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
	__releases(&pkc->blk_fill_in_prog_lock)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);

	read_unlock(&pkc->blk_fill_in_prog_lock);
}
static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}
static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
	__acquires(&pkc->blk_fill_in_prog_lock)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	read_lock(&pkc->blk_fill_in_prog_lock);
	prb_run_all_ft_ops(pkc, ppd);
}
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available.user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}
static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}
static void *prb_lookup_block(const struct packet_sock *po,
			      const struct packet_ring_buffer *rb,
			      unsigned int idx,
			      int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}
static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;

	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}
/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}
static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}
static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}
#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = READ_ONCE(po->rx_ring.frame_max) + 1;
	idx = READ_ONCE(po->rx_ring.head);
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
	idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
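/* Worked example (illustrative): with 32 blocks, the ROOM_POW_OFF (= 2)
 * probe checks the block 32 >> 2 = 8 positions ahead of the active one
 * (modulo 32); only if that slot is still TP_STATUS_KERNEL is there
 * "normal" room.  The pow_off = 0 fallback skips the offset entirely and
 * probes the active slot itself, which at best yields ROOM_LOW.
 */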
static int __packet_rcv_has_room(const struct packet_sock *po,
				 const struct sk_buff *skb)
{
	const struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
		int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
				   - (skb ? skb->truesize : 0);

		if (avail > (rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	bool pressure;
	int ret;

	ret = __packet_rcv_has_room(po, skb);
	pressure = ret != ROOM_NORMAL;

	if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure)
		packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure);

	return ret;
}

static void packet_rcv_try_clear_pressure(struct packet_sock *po)
{
	if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) &&
	    __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
		packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false);
}
static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(refcount_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}
}
static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 *history = po->rollover->history;
	u32 victim, rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (READ_ONCE(history[i]) == rxhash)
			count++;

	victim = get_random_u32_below(ROLLOVER_HLEN);

	/* Avoid dirtying the cache line if possible */
	if (READ_ONCE(history[victim]) != rxhash)
		WRITE_ONCE(history[victim], rxhash);

	return count > (ROLLOVER_HLEN >> 1);
}
static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return get_random_u32_below(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(rcu_dereference(f->arr[idx]));

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(rcu_dereference(f->arr[i]));
		if (po_next != po_skip &&
		    !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(rcu_dereference(f->arr[idx]));
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
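/* Hedged userspace sketch (not from this file): sockets join the group
 * dispatched above with the PACKET_FANOUT option.  The group id lives in
 * the low 16 bits; mode and flags share the high 16:
 *
 *	int val = ((PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16)
 *		  | 123;	// 123 is an arbitrary example group id
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 */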
DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static u16 fanout_next_id;
static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	rcu_assign_pointer(f->arr[f->num_members], sk);
	smp_wmb();
	f->num_members++;
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (rcu_dereference_protected(f->arr[i],
					      lockdep_is_held(&f->lock)) == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	rcu_assign_pointer(f->arr[i],
			   rcu_dereference_protected(f->arr[f->num_members - 1],
						     lockdep_is_held(&f->lock)));
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}
static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}
static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}
static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	ret = copy_bpf_fprog_from_user(&fprog, data, len);
	if (ret)
		return ret;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_sockptr(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(new))
		return PTR_ERR(new);

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}
static int fanout_set_data(struct packet_sock *po, sockptr_t data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	}
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
		break;
	}
}
static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
{
	struct packet_fanout *f;

	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == candidate_id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			return false;
		}
	}
	return true;
}

static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
{
	u16 id = fanout_next_id;

	do {
		if (__fanout_id_is_free(sk, id)) {
			*new_id = id;
			fanout_next_id = id + 1;
			return true;
		}

		id++;
	} while (id != fanout_next_id);

	return false;
}
static int fanout_add(struct sock *sk, struct fanout_args *args)
{
	struct packet_rollover *rollover = NULL;
	struct packet_sock *po = pkt_sk(sk);
	u16 type_flags = args->type_flags;
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
		break;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	err = -EALREADY;
	if (po->fanout)
		goto out;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		err = -ENOMEM;
		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
		if (!rollover)
			goto out;
		atomic_long_set(&rollover->num, 0);
		atomic_long_set(&rollover->num_huge, 0);
		atomic_long_set(&rollover->num_failed, 0);
	}

	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
		if (args->id != 0) {
			err = -EINVAL;
			goto out;
		}
		if (!fanout_find_new_id(sk, &args->id)) {
			err = -ENOMEM;
			goto out;
		}
		/* ephemeral flag for the first socket in the group: drop it */
		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
	}

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == args->id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match) {
		if (match->flags != flags)
			goto out;
		if (args->max_num_members &&
		    args->max_num_members != match->max_num_members)
			goto out;
	} else {
		if (args->max_num_members > PACKET_FANOUT_MAX)
			goto out;
		if (!args->max_num_members)
			/* legacy PACKET_FANOUT_MAX */
			args->max_num_members = 256;
		err = -ENOMEM;
		match = kvzalloc(struct_size(match, arr, args->max_num_members),
				 GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = args->id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		refcount_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.af_packet_net = read_pnet(&match->net);
		match->prot_hook.id_match = match_fanout_group;
		match->max_num_members = args->max_num_members;
		match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING;
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;

	spin_lock(&po->bind_lock);
	if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
	    match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (refcount_read(&match->sk_ref) < match->max_num_members) {
			__dev_remove_pack(&po->prot_hook);

			/* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
			WRITE_ONCE(po->fanout, match);

			po->rollover = rollover;
			rollover = NULL;
			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
			__fanout_link(sk, po);
			err = 0;
		}
	}
	spin_unlock(&po->bind_lock);

	if (err && !refcount_read(&match->sk_ref)) {
		list_del(&match->list);
		kvfree(match);
	}

out:
	kfree(rollover);
	mutex_unlock(&fanout_mutex);
	return err;
}
/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
 * It is the responsibility of the caller to call fanout_release_data() and
 * free the returned packet_fanout (after synchronize_net())
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	mutex_lock(&fanout_mutex);
	f = po->fanout;
	if (f) {
		po->fanout = NULL;

		if (refcount_dec_and_test(&f->sk_ref))
			list_del(&f->list);
		else
			f = NULL;
	}
	mutex_unlock(&fanout_mutex);

	return f;
}
static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
					  struct sk_buff *skb)
{
	/* Earlier code assumed this would be a VLAN pkt, double-check
	 * this now that we have the actual packet in hand. We can only
	 * do this check on Ethernet devices.
	 */
	if (unlikely(dev->type != ARPHRD_ETHER))
		return false;

	skb_reset_mac_header(skb);
	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}
static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset_ct(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
{
	int depth;

	if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
	    sock->type == SOCK_RAW) {
		skb_reset_mac_header(skb);
		skb->protocol = dev_parse_header_protocol(skb);
	}

	/* Move network header to the right position for VLAN tagged packets */
	if (likely(skb->dev->type == ARPHRD_ETHER) &&
	    eth_type_vlan(skb->protocol) &&
	    vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
		skb_set_network_header(skb, depth);

	skb_probe_transport_header(skb);
}
/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	struct sockcm_cookie sockc;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
		err = -EINVAL;
		goto out_unlock;
	}
	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
	    !packet_extra_vlan_len_allowed(dev, skb)) {
		err = -EMSGSIZE;
		goto out_unlock;
	}

	sockcm_init(&sockc, sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err))
			goto out_unlock;
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = READ_ONCE(sk->sk_priority);
	skb->mark = READ_ONCE(sk->sk_mark);
	skb->tstamp = sockc.transmit_time;

	skb_setup_tx_timestamp(skb, sockc.tsflags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	packet_parse_headers(skb, sock);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}
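/* Hedged usage sketch (not from this file): packet_sendmsg_spkt() serves
 * legacy SOCK_PACKET sockets, which address the device by name rather
 * than by ifindex ("eth0" below is an arbitrary example):
 *
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 *
 * where frame must already carry the complete link-layer header.
 */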
static unsigned int run_filter(struct sk_buff *skb,
			       const struct sock *sk,
			       unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = bpf_prog_run_clear_cb(filter->prog, skb);
	rcu_read_unlock();

	return res;
}
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
			   size_t *len, int vnet_hdr_sz)
{
	struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };

	if (*len < vnet_hdr_sz)
		return -EINVAL;
	*len -= vnet_hdr_sz;

	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
		return -EINVAL;

	return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
}
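/* Hedged note (not from this file): vnet_hdr_sz is nonzero once the
 * socket enables PACKET_VNET_HDR (or sizes it with PACKET_VNET_HDR_SZ);
 * each receive is then prefixed with a virtio_net_hdr describing GSO and
 * checksum state:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one));
 */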
/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	bool is_drop_n_account = false;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev_has_header(dev)) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_hatype = dev->type;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
	 * Use their space for storing the original skb length.
	 */
	PACKET_SKB_CB(skb)->sa.origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset_ct(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_packets++;
	sock_skb_set_dropcount(sk, skb);
	skb_clear_delivery_time(skb);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk);
	return 0;

drop_n_acct:
	is_drop_n_account = true;
	atomic_inc(&po->tp_drops);
	atomic_inc(&sk->sk_drops);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	if (!is_drop_n_account)
		consume_skb(skb);
	else
		kfree_skb(skb);
	return 0;
}
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union tpacket_uhdr h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_USER;
	unsigned short macoff, hdrlen;
	unsigned int netoff;
	struct sk_buff *copy_skb = NULL;
	struct timespec64 ts;
	__u32 ts_status;
	bool is_drop_n_account = false;
	unsigned int slot_id = 0;
	int vnet_hdr_sz = 0;

	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
	 * We may add members to them until current aligned size without forcing
	 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
	 */
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev_has_header(dev)) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;

	/* If we are flooded, just give up */
	if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
		atomic_inc(&po->tp_drops);
		goto drop_n_restore;
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;
	else if (skb->pkt_type != PACKET_OUTGOING &&
		 skb_csum_unnecessary(skb))
		status |= TP_STATUS_CSUM_VALID;
	if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
		status |= TP_STATUS_GSO_TCP;

	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned int maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
				       po->tp_reserve;
		vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
		if (vnet_hdr_sz)
			netoff += vnet_hdr_sz;
		macoff = netoff - maclen;
	}
	if (netoff > USHRT_MAX) {
		atomic_inc(&po->tp_drops);
		goto drop_n_restore;
	}
	if (po->tp_version <= TPACKET_V2) {
		if (macoff + snaplen > po->rx_ring.frame_size) {
			if (po->copy_thresh &&
			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
				if (skb_shared(skb)) {
					copy_skb = skb_clone(skb, GFP_ATOMIC);
				} else {
					copy_skb = skb_get(skb);
					skb_head = skb->data;
				}
				if (copy_skb) {
					memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
					       sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
					skb_set_owner_r(copy_skb, sk);
				}
			}
			snaplen = po->rx_ring.frame_size - macoff;
			if ((int)snaplen < 0) {
				snaplen = 0;
				macoff = netoff;
			}
		}
	} else if (unlikely(macoff + snaplen >
			    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
		u32 nval;

		nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
		pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
			    snaplen, nval, macoff);
		snaplen = nval;
		if (unlikely((int)snaplen < 0)) {
			snaplen = 0;
			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
		}
	}
	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_rx_frame(po, skb,
					TP_STATUS_KERNEL, (macoff+snaplen));
	if (!h.raw)
		goto drop_n_account;

	if (po->tp_version <= TPACKET_V2) {
		slot_id = po->rx_ring.head;
		if (test_bit(slot_id, po->rx_ring.rx_owner_map))
			goto drop_n_account;
		__set_bit(slot_id, po->rx_ring.rx_owner_map);
	}

	if (vnet_hdr_sz &&
	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
				    sizeof(struct virtio_net_hdr),
				    vio_le(), true, 0)) {
		if (po->tp_version == TPACKET_V3)
			prb_clear_blk_fill_status(&po->rx_ring);
		goto drop_n_account;
	}

	if (po->tp_version <= TPACKET_V2) {
		packet_increment_rx_head(po, &po->rx_ring);
	/*
	 * LOSING will be reported till you read the stats,
	 * because it's COR - Clear On Read.
	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
	 * at packet level.
	 */
		if (atomic_read(&po->tp_drops))
			status |= TP_STATUS_LOSING;
	}

	po->stats.stats1.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		skb_clear_delivery_time(copy_skb);
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	/* Always timestamp; prefer an existing software timestamp taken
	 * closer to the time of capture.
	 */
	ts_status = tpacket_get_timestamp(skb, &ts,
					  READ_ONCE(po->tp_tstamp) |
					  SOF_TIMESTAMPING_SOFTWARE);
	if (!ts_status)
		ktime_get_real_ts64(&ts);

	status |= ts_status;

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		if (skb_vlan_tag_present(skb)) {
			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			h.h2->tp_vlan_tci = 0;
			h.h2->tp_vlan_tpid = 0;
		}
		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
		hdrlen = sizeof(*h.h2);
		break;
	case TPACKET_V3:
		/* tp_nxt_offset,vlan are already populated above.
		 * So DONT clear those fields here
		 */
		h.h3->tp_status |= status;
		h.h3->tp_len = skb->len;
		h.h3->tp_snaplen = snaplen;
		h.h3->tp_mac = macoff;
		h.h3->tp_net = netoff;
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
		hdrlen = sizeof(*h.h3);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	smp_mb();

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	if (po->tp_version <= TPACKET_V2) {
		u8 *start, *end;

		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
					macoff + snaplen);

		for (start = h.raw; start < end; start += PAGE_SIZE)
			flush_dcache_page(pgv_to_page(start));
	}
	smp_wmb();
#endif

	if (po->tp_version <= TPACKET_V2) {
		spin_lock(&sk->sk_receive_queue.lock);
		__packet_set_status(po, h.raw, status);
		__clear_bit(slot_id, po->rx_ring.rx_owner_map);
		spin_unlock(&sk->sk_receive_queue.lock);
		sk->sk_data_ready(sk);
	} else if (po->tp_version == TPACKET_V3) {
		prb_clear_blk_fill_status(&po->rx_ring);
	}

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
= skb_head
;
2501 if (!is_drop_n_account
)
2508 spin_unlock(&sk
->sk_receive_queue
.lock
);
2509 atomic_inc(&po
->tp_drops
);
2510 is_drop_n_account
= true;
2512 sk
->sk_data_ready(sk
);
2513 kfree_skb(copy_skb
);
2514 goto drop_n_restore
;
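
/* Illustrative userspace sketch (not part of the kernel build): consuming
 * one TPACKET_V2 rx-ring slot that tpacket_rcv() above fills in.  Assumes
 * "ring" is the mmap()ed buffer, and "frame_nr"/"frame_size" match the
 * tpacket_req used to create the ring; handle_frame()/note_drops() are
 * hypothetical helpers.
 *
 *	struct tpacket2_hdr *hdr = (void *)(ring + idx * frame_size);
 *
 *	if (hdr->tp_status & TP_STATUS_USER) {
 *		uint8_t *mac = (uint8_t *)hdr + hdr->tp_mac;
 *
 *		handle_frame(mac, hdr->tp_snaplen);
 *		if (hdr->tp_status & TP_STATUS_LOSING)
 *			note_drops();			// kernel saw drops
 *		hdr->tp_status = TP_STATUS_KERNEL;	// hand slot back
 *		idx = (idx + 1) % frame_nr;
 *	}
 */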
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);

	if (likely(po->tx_ring.pg_vec)) {
		void *ph;
		__u32 ts;

		ph = skb_zcopy_get_nouarg(skb);
		packet_dec_pending(&po->tx_ring);

		ts = __packet_set_timestamp(po, ph, skb);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);

		if (!packet_read_pending(&po->tx_ring))
			complete(&po->skb_completion);
	}

	sock_wfree(skb);
}
static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
	    (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
	     __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
	      __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
		vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
			 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
			__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);

	if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
		return -EINVAL;

	return 0;
}
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
				 struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
{
	int ret;

	if (*len < vnet_hdr_sz)
		return -EINVAL;
	*len -= vnet_hdr_sz;

	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
		return -EFAULT;

	ret = __packet_snd_vnet_parse(vnet_hdr, *len);
	if (ret)
		return ret;

	/* move iter to point to the start of mac header */
	if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
		iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));

	return 0;
}
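
/* Illustrative userspace sketch: with a nonzero PACKET_VNET_HDR{,_SZ}, each
 * sendmsg() payload must start with the struct virtio_net_hdr that the two
 * parsers above validate.  A minimal "no offload" header looks like:
 *
 *	struct virtio_net_hdr vh = {
 *		.flags = 0,			// no VIRTIO_NET_HDR_F_NEEDS_CSUM
 *		.gso_type = VIRTIO_NET_HDR_GSO_NONE,
 *	};
 *	struct iovec iov[2] = {
 *		{ &vh, sizeof(vh) },		// consumed by packet_snd_vnet_parse()
 *		{ frame, frame_len },		// the actual l2 frame
 *	};
 */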
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, void *data, int tp_len,
		__be16 proto, unsigned char *addr, int hlen, int copylen,
		const struct sockcm_cookie *sockc)
{
	union tpacket_uhdr ph;
	int to_write, offset, len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = READ_ONCE(po->sk.sk_priority);
	skb->mark = READ_ONCE(po->sk.sk_mark);
	skb->tstamp = sockc->transmit_time;
	skb_setup_tx_timestamp(skb, sockc->tsflags);
	skb_zcopy_set_nouarg(skb, ph.raw);

	skb_reserve(skb, hlen);
	skb_reset_network_header(skb);

	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				      NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (copylen) {
		int hdrlen = min_t(int, copylen, tp_len);

		skb_push(skb, dev->hard_header_len);
		skb_put(skb, copylen - dev->hard_header_len);
		err = skb_store_bits(skb, 0, data, hdrlen);
		if (unlikely(err))
			return err;
		if (!dev_validate_header(dev, skb->data, hdrlen))
			return -EINVAL;

		data += hdrlen;
		to_write -= hdrlen;
	}

	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	refcount_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%u)\n",
			       (unsigned int)MAX_SKB_FRAGS);
			return -EFAULT;
		}

		page = pgv_to_page(data);
		data += len;
		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb, nr_frags, page, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	packet_parse_headers(skb, sock);

	return tp_len;
}
static int tpacket_parse_header(struct packet_sock *po, void *frame,
				int size_max, void **data)
{
	union tpacket_uhdr ph;
	int tp_len, off;

	ph.raw = frame;

	switch (po->tp_version) {
	case TPACKET_V3:
		if (ph.h3->tp_next_offset != 0) {
			pr_warn_once("variable sized slot not supported");
			return -EINVAL;
		}
		tp_len = ph.h3->tp_len;
		break;
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) {
		int off_min, off_max;

		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
		off_max = po->tx_ring.frame_size - tp_len;
		if (po->sk.sk_type == SOCK_DGRAM) {
			switch (po->tp_version) {
			case TPACKET_V3:
				off = ph.h3->tp_net;
				break;
			case TPACKET_V2:
				off = ph.h2->tp_net;
				break;
			default:
				off = ph.h1->tp_net;
				break;
			}
		} else {
			switch (po->tp_version) {
			case TPACKET_V3:
				off = ph.h3->tp_mac;
				break;
			case TPACKET_V2:
				off = ph.h2->tp_mac;
				break;
			default:
				off = ph.h1->tp_mac;
				break;
			}
		}
		if (unlikely((off < off_min) || (off_max < off)))
			return -EINVAL;
	} else {
		off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
	}

	*data = frame + off;
	return tp_len;
}
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	struct virtio_net_hdr *vnet_hdr = NULL;
	struct sockcm_cookie sockc;
	__be16 proto;
	int err, reserve = 0;
	void *ph;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
	unsigned char *addr = NULL;
	int tp_len, size_max;
	void *data;
	int len_sum = 0;
	int status = TP_STATUS_AVAILABLE;
	int hlen, tlen, copylen = 0;
	long timeo = 0;

	mutex_lock(&po->pg_vec_lock);

	/* packet_sendmsg() check on tx_ring.pg_vec was lockless,
	 * we need to confirm it under protection of pg_vec_lock.
	 */
	if (unlikely(!po->tx_ring.pg_vec)) {
		err = -EBUSY;
		goto out;
	}
	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= READ_ONCE(po->num);
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
		if (po->sk.sk_socket->type == SOCK_DGRAM) {
			if (dev && msg->msg_namelen < dev->addr_len +
				   offsetof(struct sockaddr_ll, sll_addr))
				goto out_put;
			addr = saddr->sll_addr;
		}
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	sockcm_init(&sockc, &po->sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(&po->sk, msg, &sockc);
		if (unlikely(err))
			goto out_put;
	}

	if (po->sk.sk_socket->type == SOCK_RAW)
		reserve = dev->hard_header_len;
	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
		size_max = dev->mtu + reserve + VLAN_HLEN;

	reinit_completion(&po->skb_completion);

	do {
		ph = packet_current_frame(po, &po->tx_ring,
					  TP_STATUS_SEND_REQUEST);
		if (unlikely(ph == NULL)) {
			if (need_wait && skb) {
				timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
				timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
				if (timeo <= 0) {
					err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
					goto out_put;
				}
			}
			/* check for additional frames */
			continue;
		}

		skb = NULL;
		tp_len = tpacket_parse_header(po, ph, size_max, &data);
		if (tp_len < 0)
			goto tpacket_error;

		status = TP_STATUS_SEND_REQUEST;
		hlen = LL_RESERVED_SPACE(dev);
		tlen = dev->needed_tailroom;
		if (vnet_hdr_sz) {
			vnet_hdr = data;
			data += vnet_hdr_sz;
			tp_len -= vnet_hdr_sz;
			if (tp_len < 0 ||
			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
				tp_len = -EINVAL;
				goto tpacket_error;
			}
			copylen = __virtio16_to_cpu(vio_le(),
						    vnet_hdr->hdr_len);
		}
		copylen = max_t(int, copylen, dev->hard_header_len);
		skb = sock_alloc_send_skb(&po->sk,
				hlen + tlen + sizeof(struct sockaddr_ll) +
				(copylen - dev->hard_header_len),
				!need_wait, &err);

		if (unlikely(skb == NULL)) {
			/* we assume the socket was initially writeable ... */
			if (likely(len_sum > 0))
				err = len_sum;
			goto out_status;
		}
		tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
					  addr, hlen, copylen, &sockc);
		if (likely(tp_len >= 0) &&
		    tp_len > dev->mtu + reserve &&
		    !vnet_hdr_sz &&
		    !packet_extra_vlan_len_allowed(dev, skb))
			tp_len = -EMSGSIZE;

		if (unlikely(tp_len < 0)) {
tpacket_error:
			if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		if (vnet_hdr_sz) {
			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
				tp_len = -EINVAL;
				goto tpacket_error;
			}
			virtio_net_hdr_set_proto(skb, vnet_hdr);
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		packet_inc_pending(&po->tx_ring);

		status = TP_STATUS_SEND_REQUEST;
		err = packet_xmit(po, skb);
		if (unlikely(err != 0)) {
			if (err > 0)
				err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
		/* Note: packet_read_pending() might be slow if we have
		 * to call it as it's per_cpu variable, but in fast-path
		 * we already short-circuit the loop with the first
		 * condition, and luckily don't have to go that path
		 * anyway.
		 */
		 (need_wait && packet_read_pending(&po->tx_ring))));

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
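
/* Illustrative userspace sketch: queueing one frame on a TPACKET_V2 tx ring
 * and kicking tpacket_snd() above.  Data starts at the aligned end of the
 * slot header because PACKET_TX_HAS_OFF is off in this sketch; "ring",
 * "idx" and "frame_size" are assumed to match the ring setup.
 *
 *	struct tpacket2_hdr *hdr = (void *)(ring + idx * frame_size);
 *	uint8_t *data = (uint8_t *)hdr + TPACKET_ALIGN(sizeof(*hdr));
 *
 *	memcpy(data, frame, frame_len);		// full l2 frame for SOCK_RAW
 *	hdr->tp_len = frame_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);			// flush the ring
 */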
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					size_t reserve, size_t len,
					size_t linear, int noblock,
					int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
		linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err, PAGE_ALLOC_COSTLY_ORDER);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr = NULL;
	int err, reserve = 0;
	struct sockcm_cookie sockc;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	struct packet_sock *po = pkt_sk(sk);
	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
	int hlen, tlen, linear;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= READ_ONCE(po->num);
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
		if (sock->type == SOCK_DGRAM) {
			if (dev && msg->msg_namelen < dev->addr_len +
				   offsetof(struct sockaddr_ll, sll_addr))
				goto out_unlock;
			addr = saddr->sll_addr;
		}
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out_unlock;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_unlock;

	sockcm_init(&sockc, sk);
	sockc.mark = READ_ONCE(sk->sk_mark);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err))
			goto out_unlock;
	}

	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;
	if (vnet_hdr_sz) {
		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
		if (err)
			goto out_unlock;
	}

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (!vnet_hdr.gso_type &&
	    (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
		goto out_unlock;

	err = -ENOBUFS;
	hlen = LL_RESERVED_SPACE(dev);
	tlen = dev->needed_tailroom;
	linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
	linear = max(linear, min_t(int, len, dev->hard_header_len));
	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_reset_network_header(skb);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM) {
		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (unlikely(offset < 0))
			goto out_free;
	} else if (reserve) {
		skb_reserve(skb, -reserve);
		if (len < reserve + sizeof(struct ipv6hdr) &&
		    dev->min_header_len != dev->hard_header_len)
			skb_reset_network_header(skb);
	}

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
	if (err)
		goto out_free;

	if ((sock->type == SOCK_RAW &&
	     !dev_validate_header(dev, skb->data, len)) || !skb->len) {
		err = -EINVAL;
		goto out_free;
	}

	skb_setup_tx_timestamp(skb, sockc.tsflags);

	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
	    !packet_extra_vlan_len_allowed(dev, skb)) {
		err = -EMSGSIZE;
		goto out_free;
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = READ_ONCE(sk->sk_priority);
	skb->mark = sockc.mark;
	skb->tstamp = sockc.transmit_time;

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	packet_parse_headers(skb, sock);

	if (vnet_hdr_sz) {
		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
		if (err)
			goto out_free;
		len += vnet_hdr_sz;
		virtio_net_hdr_set_proto(skb, &vnet_hdr);
	}

	err = packet_xmit(po, skb);

	if (unlikely(err != 0)) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto out_unlock;
	}

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	dev_put(dev);
out:
	return err;
}
static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	/* Reading tx_ring.pg_vec without holding pg_vec_lock is racy.
	 * tpacket_snd() will redo the check safely.
	 */
	if (data_race(po->tx_ring.pg_vec))
		return tpacket_snd(po, msg);

	return packet_snd(sock, msg, len);
}
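
/* Illustrative userspace sketch: the non-ring path above is what a plain
 * sendto() on a SOCK_RAW packet socket ends up in.  "frame" must be a
 * complete l2 frame including the Ethernet header; the interface name is
 * an assumption of the sketch.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family  = AF_PACKET,
 *		.sll_ifindex = if_nametoindex("eth0"),
 *		.sll_halen   = ETH_ALEN,
 *	};
 *	memcpy(sll.sll_addr, dest_mac, ETH_ALEN);
 *	sendto(fd, frame, frame_len, 0, (struct sockaddr *)&sll, sizeof(sll));
 */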
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct packet_fanout *f;
	struct net *net;
	union tpacket_req_u req_u;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	mutex_lock(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->packet.sklist_lock);

	sock_prot_inuse_add(net, sk->sk_prot, -1);

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, false);
	packet_cached_dev_reset(po);

	if (po->prot_hook.dev) {
		netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
		po->prot_hook.dev = NULL;
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	lock_sock(sk);
	if (po->rx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 0);
	}

	if (po->tx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 1);
	}
	release_sock(sk);

	f = fanout_release(sk);

	synchronize_net();

	kfree(po->rollover);
	if (f) {
		fanout_release_data(f);
		kvfree(f);
	}

	/*
	 *	Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	packet_free_pending(po);

	sock_put(sk);
	return 0;
}
/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
			  __be16 proto)
{
	struct packet_sock *po = pkt_sk(sk);
	struct net_device *dev = NULL;
	bool unlisted = false;
	bool need_rehook;
	int ret = 0;

	lock_sock(sk);
	spin_lock(&po->bind_lock);
	rcu_read_lock();

	if (po->fanout) {
		ret = -EINVAL;
		goto out_unlock;
	}

	if (name) {
		dev = dev_get_by_name_rcu(sock_net(sk), name);
		if (!dev) {
			ret = -ENODEV;
			goto out_unlock;
		}
	} else if (ifindex) {
		dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
		if (!dev) {
			ret = -ENODEV;
			goto out_unlock;
		}
	}

	need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;

	if (need_rehook) {
		dev_hold(dev);
		if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
			rcu_read_unlock();
			/* prevents packet_notifier() from calling
			 * register_prot_hook()
			 */
			WRITE_ONCE(po->num, 0);
			__unregister_prot_hook(sk, true);
			rcu_read_lock();
			if (dev)
				unlisted = !dev_get_by_index_rcu(sock_net(sk),
								 dev->ifindex);
		}

		BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING));
		WRITE_ONCE(po->num, proto);
		po->prot_hook.type = proto;

		netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);

		if (unlikely(unlisted)) {
			po->prot_hook.dev = NULL;
			WRITE_ONCE(po->ifindex, -1);
			packet_cached_dev_reset(po);
		} else {
			netdev_hold(dev, &po->prot_hook.dev_tracker,
				    GFP_ATOMIC);
			po->prot_hook.dev = dev;
			WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
			packet_cached_dev_assign(po, dev);
		}
		dev_put(dev);
	}

	if (proto == 0 || !need_rehook)
		goto out_unlock;

	if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
		register_prot_hook(sk);
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk_error_report(sk);
	}

out_unlock:
	rcu_read_unlock();
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return ret;
}
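
/* Illustrative userspace sketch: binding to one interface and protocol, the
 * sockaddr_ll path that ends up in packet_do_bind() above.  The interface
 * name is an assumption of the sketch.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */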
/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[sizeof(uaddr->sa_data_min) + 1];

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	/* uaddr->sa_data comes from the userspace, it's not guaranteed to be
	 * zero-terminated.
	 */
	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
	name[sizeof(uaddr->sa_data_min)] = 0;

	return packet_do_bind(sk, name, 0, 0);
}
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;

	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
}
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};
/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	init_completion(&po->skb_completion);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	err = packet_alloc_pending(po);
	if (err)
		goto out_sk_free;

	packet_cached_dev_reset(po);

	sk->sk_destruct = packet_sock_destruct;

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->rollover = NULL;
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;
	po->prot_hook.af_packet_net = sock_net(sk);

	if (proto) {
		po->prot_hook.type = proto;
		__register_prot_hook(sk);
	}

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_tail_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	sock_prot_inuse_add(net, &packet_proto, 1);

	return 0;
out_sk_free:
	sk_free(sk);
out:
	return err;
}
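
/* Illustrative userspace sketch: the socket(2) calls that reach
 * packet_create() above (CAP_NET_RAW is required in the socket's
 * user namespace):
 *
 *	int raw  = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL)); // l2 frame incl. header
 *	int dgrm = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));  // ll header stripped
 */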
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			  int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
	unsigned int origlen = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	if (flags & MSG_ERRQUEUE) {
		err = sock_recv_errqueue(sk, msg, len,
					 SOL_PACKET, PACKET_TX_TIMESTAMP);
		goto out;
	}

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if the device has just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	packet_rcv_try_clear_pressure(pkt_sk(sk));

	if (vnet_hdr_len) {
		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
		if (err)
			goto out_free;
	}

	/* You lose any data beyond the buffer you gave. If it worries
	 * a user program they can ask the device for its MTU
	 * anyway.
	 */
	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free;

	if (sock->type != SOCK_PACKET) {
		struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

		/* Original length was stored in sockaddr_ll fields */
		origlen = PACKET_SKB_CB(skb)->sa.origlen;
		sll->sll_family = AF_PACKET;
		sll->sll_protocol = skb->protocol;
	}

	sock_recv_cmsgs(msg, sk, skb);

	if (msg->msg_name) {
		const size_t max_len = min(sizeof(skb->cb),
					   sizeof(struct sockaddr_storage));
		int copy_len;

		/* If the address length field is there to be filled
		 * in, we fill it in now.
		 */
		if (sock->type == SOCK_PACKET) {
			__sockaddr_check_size(sizeof(struct sockaddr_pkt));
			msg->msg_namelen = sizeof(struct sockaddr_pkt);
			copy_len = msg->msg_namelen;
		} else {
			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

			msg->msg_namelen = sll->sll_halen +
				offsetof(struct sockaddr_ll, sll_addr);
			copy_len = msg->msg_namelen;
			if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
				memset(msg->msg_name +
				       offsetof(struct sockaddr_ll, sll_addr),
				       0, sizeof(sll->sll_addr));
				msg->msg_namelen = sizeof(struct sockaddr_ll);
			}
		}
		if (WARN_ON_ONCE(copy_len > max_len)) {
			copy_len = max_len;
			msg->msg_namelen = copy_len;
		}
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
	}

	if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		else if (skb->pkt_type != PACKET_OUTGOING &&
			 skb_csum_unnecessary(skb))
			aux.tp_status |= TP_STATUS_CSUM_VALID;
		if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
			aux.tp_status |= TP_STATUS_GSO_TCP;

		aux.tp_len = origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		if (skb_vlan_tag_present(skb)) {
			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			aux.tp_vlan_tci = 0;
			aux.tp_vlan_tpid = 0;
		}
		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int peer)
{
	struct net_device *dev;
	struct sock *sk	= sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min));
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
	if (dev)
		strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min));
	rcu_read_unlock();

	return sizeof(*uaddr);
}
static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
	int ifindex;

	if (peer)
		return -EOPNOTSUPP;

	ifindex = READ_ONCE(po->ifindex);
	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = ifindex;
	sll->sll_protocol = READ_ONCE(po->num);
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;

		/* Let __fortify_memcpy_chk() know the actual buffer size. */
		memcpy(((struct sockaddr_storage *)sll)->__data +
		       offsetof(struct sockaddr_ll, sll_addr) -
		       offsetofend(struct sockaddr_ll, sll_family),
		       dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();

	return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
}
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}
static void packet_dev_mclist_delete(struct net_device *dev,
				     struct packet_mclist **mlp)
{
	struct packet_mclist *ml;

	while ((ml = *mlp) != NULL) {
		if (ml->ifindex == dev->ifindex) {
			packet_dev_mc(dev, ml, -1);
			*mlp = ml->next;
			kfree(ml);
		} else
			mlp = &ml->next;
	}
}
*sk
, struct packet_mreq_max
*mreq
)
3672 struct packet_sock
*po
= pkt_sk(sk
);
3673 struct packet_mclist
*ml
, *i
;
3674 struct net_device
*dev
;
3680 dev
= __dev_get_by_index(sock_net(sk
), mreq
->mr_ifindex
);
3685 if (mreq
->mr_alen
> dev
->addr_len
)
3689 i
= kmalloc(sizeof(*i
), GFP_KERNEL
);
3694 for (ml
= po
->mclist
; ml
; ml
= ml
->next
) {
3695 if (ml
->ifindex
== mreq
->mr_ifindex
&&
3696 ml
->type
== mreq
->mr_type
&&
3697 ml
->alen
== mreq
->mr_alen
&&
3698 memcmp(ml
->addr
, mreq
->mr_address
, ml
->alen
) == 0) {
3700 /* Free the new element ... */
3706 i
->type
= mreq
->mr_type
;
3707 i
->ifindex
= mreq
->mr_ifindex
;
3708 i
->alen
= mreq
->mr_alen
;
3709 memcpy(i
->addr
, mreq
->mr_address
, i
->alen
);
3710 memset(i
->addr
+ i
->alen
, 0, sizeof(i
->addr
) - i
->alen
);
3712 i
->next
= po
->mclist
;
3714 err
= packet_dev_mc(dev
, i
, 1);
3716 po
->mclist
= i
->next
;
3725 static int packet_mc_drop(struct sock
*sk
, struct packet_mreq_max
*mreq
)
3727 struct packet_mclist
*ml
, **mlp
;
3731 for (mlp
= &pkt_sk(sk
)->mclist
; (ml
= *mlp
) != NULL
; mlp
= &ml
->next
) {
3732 if (ml
->ifindex
== mreq
->mr_ifindex
&&
3733 ml
->type
== mreq
->mr_type
&&
3734 ml
->alen
== mreq
->mr_alen
&&
3735 memcmp(ml
->addr
, mreq
->mr_address
, ml
->alen
) == 0) {
3736 if (--ml
->count
== 0) {
3737 struct net_device
*dev
;
3739 dev
= __dev_get_by_index(sock_net(sk
), ml
->ifindex
);
3741 packet_dev_mc(dev
, ml
, -1);
3751 static void packet_flush_mclist(struct sock
*sk
)
3753 struct packet_sock
*po
= pkt_sk(sk
);
3754 struct packet_mclist
*ml
;
3760 while ((ml
= po
->mclist
) != NULL
) {
3761 struct net_device
*dev
;
3763 po
->mclist
= ml
->next
;
3764 dev
= __dev_get_by_index(sock_net(sk
), ml
->ifindex
);
3766 packet_dev_mc(dev
, ml
, -1);
static int
packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
		  unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;

		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_sockptr(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		union tpacket_req_u req_u;
		int len;

		lock_sock(sk);
		switch (po->tp_version) {
		case TPACKET_V1:
		case TPACKET_V2:
			len = sizeof(req_u.req);
			break;
		case TPACKET_V3:
		default:
			len = sizeof(req_u.req3);
			break;
		}
		if (optlen < len) {
			ret = -EINVAL;
		} else {
			if (copy_from_sockptr(&req_u.req, optval, len))
				ret = -EFAULT;
			else
				ret = packet_set_ring(sk, &req_u, 0,
						      optname == PACKET_TX_RING);
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
		case TPACKET_V3:
			break;
		default:
			return -EINVAL;
		}
		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_version = val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;
		if (val > INT_MAX)
			return -EINVAL;
		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_reserve = val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val);
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
		return 0;
	}
	case PACKET_VNET_HDR:
	case PACKET_VNET_HDR_SZ:
	{
		int val, hdr_len;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		if (optname == PACKET_VNET_HDR_SZ) {
			if (val && val != sizeof(struct virtio_net_hdr) &&
			    val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
				return -EINVAL;
			hdr_len = val;
		} else {
			hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
		}
		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		WRITE_ONCE(po->tp_tstamp, val);
		return 0;
	}
	case PACKET_FANOUT:
	{
		struct fanout_args args = { 0 };

		if (optlen != sizeof(int) && optlen != sizeof(args))
			return -EINVAL;
		if (copy_from_sockptr(&args, optval, optlen))
			return -EFAULT;

		return fanout_add(sk, &args);
	}
	case PACKET_FANOUT_DATA:
	{
		/* Paired with the WRITE_ONCE() in fanout_add() */
		if (!READ_ONCE(po->fanout))
			return -EINVAL;

		return fanout_set_data(po, optval, optlen);
	}
	case PACKET_IGNORE_OUTGOING:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;
		if (val < 0 || val > 1)
			return -EINVAL;

		po->prot_hook.ignore_outgoing = !!val;
		return 0;
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
			packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val);

		release_sock(sk);
		return 0;
	}
	case PACKET_QDISC_BYPASS:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val);
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
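
/* Illustrative userspace sketch: putting the bound interface into
 * promiscuous mode through the membership options handled above.  The
 * interface name is an assumption of the sketch.
 *
 *	struct packet_mreq mr = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mr, sizeof(mr));
 */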
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;
	struct tpacket_rollover_stats rstats;
	int drops;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		drops = atomic_xchg(&po->tp_drops, 0);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_drops = drops;
			st.stats3.tp_packets += drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_drops = drops;
			st.stats1.tp_packets += drops;
			data = &st.stats1;
		}

		break;
	case PACKET_AUXDATA:
		val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
		break;
	case PACKET_ORIGDEV:
		val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
		break;
	case PACKET_VNET_HDR:
		val = !!READ_ONCE(po->vnet_hdr_sz);
		break;
	case PACKET_VNET_HDR_SZ:
		val = READ_ONCE(po->vnet_hdr_sz);
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (len < sizeof(int))
			return -EINVAL;
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS);
		break;
	case PACKET_TIMESTAMP:
		val = READ_ONCE(po->tp_tstamp);
		break;
	case PACKET_FANOUT:
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0);
		break;
	case PACKET_IGNORE_OUTGOING:
		val = po->prot_hook.ignore_outgoing;
		break;
	case PACKET_ROLLOVER_STATS:
		if (!po->rollover)
			return -EINVAL;
		rstats.tp_all = atomic_long_read(&po->rollover->num);
		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
		data = &rstats;
		lv = sizeof(rstats);
		break;
	case PACKET_TX_HAS_OFF:
		val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF);
		break;
	case PACKET_QDISC_BYPASS:
		val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct sock *sk;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist_delete(dev, &po->mclist);
			fallthrough;

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					packet_cached_dev_reset(po);
					WRITE_ONCE(po->ifindex, -1);
					netdev_put(po->prot_hook.dev,
						   &po->prot_hook.dev_tracker);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
#ifdef CONFIG_INET
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
static __poll_t packet_poll(struct file *file, struct socket *sock,
			    poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	__poll_t mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= EPOLLIN | EPOLLRDNORM;
	}
	packet_rcv_try_clear_pressure(po);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= EPOLLOUT | EPOLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
	if (buffer)
		return buffer;

	/* vmalloc failed, lets dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
			   int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long *rx_owner_map = NULL;
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		unsigned int min_frame_size;

		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
			goto out;
		min_frame_size = po->tp_hdrlen + po->tp_reserve;
		if (po->tp_version >= TPACKET_V3 &&
		    req->tp_block_size <
		    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
			goto out;
		if (unlikely(req->tp_frame_size < min_frame_size))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(rb->frames_per_block == 0))
			goto out;
		if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Block transmit is not supported yet */
			if (!tx_ring) {
				init_prb_bdqc(po, rb, pg_vec, req_u);
			} else {
				struct tpacket_req3 *req3 = &req_u->req3;

				if (req3->tp_retire_blk_tov ||
				    req3->tp_sizeof_priv ||
				    req3->tp_feature_req_word) {
					err = -EINVAL;
					goto out_free_pg_vec;
				}
			}
			break;
		default:
			if (!tx_ring) {
				rx_owner_map = bitmap_alloc(req->tp_frame_nr,
					GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
				if (!rx_owner_map)
					goto out_free_pg_vec;
			}
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING);
	num = po->num;
	if (was_running) {
		WRITE_ONCE(po->num, 0);
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		if (po->tp_version <= TPACKET_V2)
			swap(rb->rx_owner_map, rx_owner_map);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		WRITE_ONCE(po->num, num);
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (pg_vec && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, rb_queue);
	}

out_free_pg_vec:
	if (pg_vec) {
		bitmap_free(rx_owner_map);
		free_pg_vec(pg_vec, order, req->tp_block_nr);
	}
out:
	return err;
}
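
/* Illustrative userspace sketch: the setsockopt() sequence that reaches
 * packet_set_ring() above for a TPACKET_V2 receive ring.  The sizes are
 * example values; tp_block_size must be a page-aligned multiple of
 * tp_frame_size, and tp_frame_nr must equal frames-per-block times
 * tp_block_nr, as the sanity checks above enforce.
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 16,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,
 *		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */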
static int packet_mmap(struct file *file, struct socket *sock,
		       struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
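
/* Illustrative userspace sketch: mapping the ring(s) set up earlier.  The
 * length must cover the rx and tx rings together, with offset 0, as
 * packet_mmap() above checks:
 *
 *	size_t maplen = (size_t)req.tp_block_size * req.tp_block_nr;
 *	uint8_t *ring = mmap(NULL, maplen, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED, fd, 0);
 */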
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};
#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq,
			   "%*sRefCnt Type Proto  Iface R Rmem   User   Inode\n",
			   IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   refcount_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(READ_ONCE(po->num)),
			   READ_ONCE(po->ifindex),
			   packet_sock_flag(po, PACKET_SOCK_RUNNING),
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
#endif
static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
			sizeof(struct seq_net_private)))
		return -ENOMEM;
#endif /* CONFIG_PROC_FS */

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
}

static int __init packet_init(void)
{
	int rc;

	rc = register_pernet_subsys(&packet_net_ops);
	if (rc)
		goto out;
	rc = register_netdevice_notifier(&packet_netdev_notifier);
	if (rc)
		goto out_pernet;
	rc = proto_register(&packet_proto, 0);
	if (rc)
		goto out_notifier;
	rc = sock_register(&packet_family_ops);
	if (rc)
		goto out_proto;

	return 0;

out_proto:
	proto_unregister(&packet_proto);
out_notifier:
	unregister_netdevice_notifier(&packet_netdev_notifier);
out_pernet:
	unregister_pernet_subsys(&packet_net_ops);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);