// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/pci.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/rtnetlink.h>
#include <linux/netpoll.h>
#include <linux/bpf.h>

#include <net/route.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>

#include "hyperv_net.h"
#define RING_SIZE_MIN	64
#define RETRY_US_LO	5000
#define RETRY_US_HI	10000
#define RETRY_MAX	2000	/* >10 sec */

#define LINKCHANGE_INT	(2 * HZ)
#define VF_TAKEOVER_INT	(HZ / 10)
static unsigned int ring_size __ro_after_init = 128;
module_param(ring_size, uint, 0444);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
unsigned int netvsc_ring_bytes __ro_after_init;
static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
				NETIF_MSG_LINK | NETIF_MSG_IFUP |
				NETIF_MSG_IFDOWN | NETIF_MSG_RX_ERR |
				NETIF_MSG_TX_ERR;

static int debug = -1;
module_param(debug, int, 0444);
MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
static LIST_HEAD(netvsc_dev_list);
static void netvsc_change_rx_flags(struct net_device *net, int change)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct net_device *vf_netdev
		= rtnl_dereference(ndev_ctx->vf_netdev);
	int inc;

	if (!vf_netdev)
		return;

	if (change & IFF_PROMISC) {
		inc = (net->flags & IFF_PROMISC) ? 1 : -1;
		dev_set_promiscuity(vf_netdev, inc);
	}

	if (change & IFF_ALLMULTI) {
		inc = (net->flags & IFF_ALLMULTI) ? 1 : -1;
		dev_set_allmulti(vf_netdev, inc);
	}
}
static void netvsc_set_rx_mode(struct net_device *net)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct net_device *vf_netdev;
	struct netvsc_device *nvdev;

	rcu_read_lock();
	vf_netdev = rcu_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev) {
		dev_uc_sync(vf_netdev, net);
		dev_mc_sync(vf_netdev, net);
	}

	nvdev = rcu_dereference(ndev_ctx->nvdev);
	if (nvdev)
		rndis_filter_update(nvdev);
	rcu_read_unlock();
}
static void netvsc_tx_enable(struct netvsc_device *nvscdev,
			     struct net_device *ndev)
{
	nvscdev->tx_disable = false;
	virt_wmb(); /* ensure queue wake up mechanism is on */

	netif_tx_wake_all_queues(ndev);
}
static int netvsc_open(struct net_device *net)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);
	struct rndis_device *rdev;
	int ret = 0;

	netif_carrier_off(net);

	/* Open up the device */
	ret = rndis_filter_open(nvdev);
	if (ret != 0) {
		netdev_err(net, "unable to open device (ret %d).\n", ret);
		return ret;
	}

	rdev = nvdev->extension;
	if (!rdev->link_state) {
		netif_carrier_on(net);
		netvsc_tx_enable(nvdev, net);
	}

	if (vf_netdev) {
		/* Setting synthetic device up transparently sets
		 * slave as up. If open fails, the slave will still
		 * be offline (and not used).
		 */
		ret = dev_open(vf_netdev, NULL);
		if (ret)
			netdev_warn(net,
				    "unable to open slave: %s: %d\n",
				    vf_netdev->name, ret);
	}
	return 0;
}
static int netvsc_wait_until_empty(struct netvsc_device *nvdev)
{
	unsigned int retry = 0;
	int i;

	/* Ensure pending bytes in ring are read */
	for (;;) {
		u32 aread = 0;

		for (i = 0; i < nvdev->num_chn; i++) {
			struct vmbus_channel *chn
				= nvdev->chan_table[i].channel;

			if (!chn)
				continue;

			/* make sure receive not running now */
			napi_synchronize(&nvdev->chan_table[i].napi);

			aread = hv_get_bytes_to_read(&chn->inbound);
			if (aread)
				break;

			aread = hv_get_bytes_to_read(&chn->outbound);
			if (aread)
				break;
		}

		if (aread == 0)
			return 0;

		if (++retry > RETRY_MAX)
			return -ETIMEDOUT;

		usleep_range(RETRY_US_LO, RETRY_US_HI);
	}
}
static void netvsc_tx_disable(struct netvsc_device *nvscdev,
			      struct net_device *ndev)
{
	if (nvscdev) {
		nvscdev->tx_disable = true;
		virt_wmb(); /* ensure txq will not wake up after stop */
	}

	netif_tx_disable(ndev);
}
static int netvsc_close(struct net_device *net)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct net_device *vf_netdev
		= rtnl_dereference(net_device_ctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
	int ret;

	netvsc_tx_disable(nvdev, net);

	/* No need to close rndis filter if it is removed already */
	if (!nvdev)
		return 0;

	ret = rndis_filter_close(nvdev);
	if (ret != 0) {
		netdev_err(net, "unable to close device (ret %d).\n", ret);
		return ret;
	}

	ret = netvsc_wait_until_empty(nvdev);
	if (ret)
		netdev_err(net, "Ring buffer not empty after closing rndis\n");

	if (vf_netdev)
		dev_close(vf_netdev);

	return ret;
}
static inline void *init_ppi_data(struct rndis_message *msg,
				  u32 ppi_size, u32 pkt_type)
{
	struct rndis_packet *rndis_pkt = &msg->msg.pkt;
	struct rndis_per_packet_info *ppi;

	rndis_pkt->data_offset += ppi_size;
	ppi = (void *)rndis_pkt + rndis_pkt->per_pkt_info_offset
		+ rndis_pkt->per_pkt_info_len;

	ppi->size = ppi_size;
	ppi->type = pkt_type;
	ppi->internal = 0;
	ppi->ppi_offset = sizeof(struct rndis_per_packet_info);

	rndis_pkt->per_pkt_info_len += ppi_size;

	return ppi + 1;
}
/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
 * packets. We can use ethtool to change UDP hash level when necessary.
 */
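/* For example (interface name is illustrative), UDP 4-tuple hashing can be
 * enabled from userspace with:
 *
 *	ethtool -N eth0 rx-flow-hash udp4 sdfn
 *
 * and reverted to 2-tuple (addresses-only) hashing with:
 *
 *	ethtool -N eth0 rx-flow-hash udp4 sd
 *
 * See netvsc_set_rss_hash_opts() below.
 */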
static inline u32 netvsc_get_hash(struct sk_buff *skb,
				  const struct net_device_context *ndc)
{
	struct flow_keys flow;
	u32 hash, pkt_proto = 0;
	static u32 hashrnd __read_mostly;

	net_get_random_once(&hashrnd, sizeof(hashrnd));

	if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
		return 0;

	switch (flow.basic.ip_proto) {
	case IPPROTO_TCP:
		if (flow.basic.n_proto == htons(ETH_P_IP))
			pkt_proto = HV_TCP4_L4HASH;
		else if (flow.basic.n_proto == htons(ETH_P_IPV6))
			pkt_proto = HV_TCP6_L4HASH;

		break;

	case IPPROTO_UDP:
		if (flow.basic.n_proto == htons(ETH_P_IP))
			pkt_proto = HV_UDP4_L4HASH;
		else if (flow.basic.n_proto == htons(ETH_P_IPV6))
			pkt_proto = HV_UDP6_L4HASH;

		break;
	}

	if (pkt_proto & ndc->l4_hash) {
		return skb_get_hash(skb);
	} else {
		if (flow.basic.n_proto == htons(ETH_P_IP))
			hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
		else if (flow.basic.n_proto == htons(ETH_P_IPV6))
			hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
		else
			hash = 0;

		__skb_set_sw_hash(skb, hash, false);
	}

	return hash;
}
static inline int netvsc_get_tx_queue(struct net_device *ndev,
				      struct sk_buff *skb, int old_idx)
{
	const struct net_device_context *ndc = netdev_priv(ndev);
	struct sock *sk = skb->sk;
	int q_idx;

	q_idx = ndc->tx_table[netvsc_get_hash(skb, ndc) &
			      (VRSS_SEND_TAB_SIZE - 1)];

	/* If queue index changed record the new value */
	if (q_idx != old_idx &&
	    sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache))
		sk_tx_queue_set(sk, q_idx);

	return q_idx;
}
/*
 * Select queue for transmit.
 *
 * If a valid queue has already been assigned, then use that.
 * Otherwise compute tx queue based on hash and the send table.
 *
 * This is basically similar to default (netdev_pick_tx) with the added step
 * of using the host send_table when no other queue has been assigned.
 *
 * TODO support XPS - but get_xps_queue not exported
 */
static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
{
	int q_idx = sk_tx_queue_get(skb->sk);

	if (q_idx < 0 || skb->ooo_okay || q_idx >= ndev->real_num_tx_queues) {
		/* If forwarding a packet, we use the recorded queue when
		 * available for better cache locality.
		 */
		if (skb_rx_queue_recorded(skb))
			q_idx = skb_get_rx_queue(skb);
		else
			q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
	}

	return q_idx;
}
static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
			       struct net_device *sb_dev)
{
	struct net_device_context *ndc = netdev_priv(ndev);
	struct net_device *vf_netdev;
	u16 txq;

	rcu_read_lock();
	vf_netdev = rcu_dereference(ndc->vf_netdev);
	if (vf_netdev) {
		const struct net_device_ops *vf_ops = vf_netdev->netdev_ops;

		if (vf_ops->ndo_select_queue)
			txq = vf_ops->ndo_select_queue(vf_netdev, skb, sb_dev);
		else
			txq = netdev_pick_tx(vf_netdev, skb, NULL);

		/* Record the queue selected by VF so that it can be
		 * used for common case where VF has more queues than
		 * the synthetic device.
		 */
		qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
	} else {
		txq = netvsc_pick_tx(ndev, skb);
	}
	rcu_read_unlock();

	while (unlikely(txq >= ndev->real_num_tx_queues))
		txq -= ndev->real_num_tx_queues;

	return txq;
}
static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
		       struct hv_page_buffer *pb)
{
	int j = 0;

	/* Deal with compound pages by ignoring unused part
	 * of the page.
	 */
	page += (offset >> PAGE_SHIFT);
	offset &= ~PAGE_MASK;

	while (len > 0) {
		unsigned long bytes;

		bytes = PAGE_SIZE - offset;
		if (bytes > len)
			bytes = len;
		pb[j].pfn = page_to_pfn(page);
		pb[j].offset = offset;
		pb[j].len = bytes;

		offset += bytes;
		len -= bytes;

		if (offset == PAGE_SIZE && len) {
			page++;
			offset = 0;
			j++;
		}
	}

	return j + 1;
}
static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
			   struct hv_netvsc_packet *packet,
			   struct hv_page_buffer *pb)
{
	u32 slots_used = 0;
	char *data = skb->data;
	int frags = skb_shinfo(skb)->nr_frags;
	int i;

	/* The packet is laid out thus:
	 * 1. hdr: RNDIS header and PPI
	 * 2. skb linear data
	 * 3. skb fragment data
	 */
	slots_used += fill_pg_buf(virt_to_page(hdr),
				  offset_in_page(hdr),
				  len, &pb[slots_used]);

	packet->rmsg_size = len;
	packet->rmsg_pgcnt = slots_used;

	slots_used += fill_pg_buf(virt_to_page(data),
				  offset_in_page(data),
				  skb_headlen(skb), &pb[slots_used]);

	for (i = 0; i < frags; i++) {
		skb_frag_t *frag = skb_shinfo(skb)->frags + i;

		slots_used += fill_pg_buf(skb_frag_page(frag),
					  skb_frag_off(frag),
					  skb_frag_size(frag), &pb[slots_used]);
	}
	return slots_used;
}
static int count_skb_frag_slots(struct sk_buff *skb)
{
	int i, frags = skb_shinfo(skb)->nr_frags;
	int pages = 0;

	for (i = 0; i < frags; i++) {
		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
		unsigned long size = skb_frag_size(frag);
		unsigned long offset = skb_frag_off(frag);

		/* Skip unused frames from start of page */
		offset &= ~PAGE_MASK;
		pages += PFN_UP(offset + size);
	}
	return pages;
}
static int netvsc_get_slots(struct sk_buff *skb)
{
	char *data = skb->data;
	unsigned int offset = offset_in_page(data);
	unsigned int len = skb_headlen(skb);
	int slots;
	int frag_slots;

	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE);
	frag_slots = count_skb_frag_slots(skb);
	return slots + frag_slots;
}
static u32 net_checksum_info(struct sk_buff *skb)
{
	if (skb->protocol == htons(ETH_P_IP)) {
		struct iphdr *ip = ip_hdr(skb);

		if (ip->protocol == IPPROTO_TCP)
			return TRANSPORT_INFO_IPV4_TCP;
		else if (ip->protocol == IPPROTO_UDP)
			return TRANSPORT_INFO_IPV4_UDP;
	} else {
		struct ipv6hdr *ip6 = ipv6_hdr(skb);

		if (ip6->nexthdr == IPPROTO_TCP)
			return TRANSPORT_INFO_IPV6_TCP;
		else if (ip6->nexthdr == IPPROTO_UDP)
			return TRANSPORT_INFO_IPV6_UDP;
	}

	return TRANSPORT_INFO_NOT_IP;
}
/* Send skb on the slave VF device. */
static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev,
			  struct sk_buff *skb)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	unsigned int len = skb->len;
	int rc;

	skb->dev = vf_netdev;
	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;

	rc = dev_queue_xmit(skb);
	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
		struct netvsc_vf_pcpu_stats *pcpu_stats
			= this_cpu_ptr(ndev_ctx->vf_stats);

		u64_stats_update_begin(&pcpu_stats->syncp);
		pcpu_stats->tx_packets++;
		pcpu_stats->tx_bytes += len;
		u64_stats_update_end(&pcpu_stats->syncp);
	} else {
		this_cpu_inc(ndev_ctx->vf_stats->tx_dropped);
	}

	return rc;
}
static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct hv_netvsc_packet *packet = NULL;
	int ret;
	unsigned int num_data_pgs;
	struct rndis_message *rndis_msg;
	struct net_device *vf_netdev;
	u32 rndis_msg_size;
	u32 hash;
	struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];

	/* if VF is present and up then redirect packets
	 * already called with rcu_read_lock_bh
	 */
	vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
	if (vf_netdev && netif_running(vf_netdev) &&
	    !netpoll_tx_running(net))
		return netvsc_vf_xmit(net, vf_netdev, skb);

	/* We will at most need two pages to describe the rndis
	 * header. We can only transmit MAX_PAGE_BUFFER_COUNT number
	 * of pages in a single packet. If skb is scattered around
	 * more pages we try linearizing it.
	 */

	num_data_pgs = netvsc_get_slots(skb) + 2;

	if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) {
		++net_device_ctx->eth_stats.tx_scattered;

		if (skb_linearize(skb))
			goto no_memory;

		num_data_pgs = netvsc_get_slots(skb) + 2;
		if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) {
			++net_device_ctx->eth_stats.tx_too_big;
			goto drop;
		}
	}

	/*
	 * Place the rndis header in the skb head room and
	 * the skb->cb will be used for hv_netvsc_packet
	 * structure
	 */
	ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE);
	if (ret)
		goto no_memory;

	/* Use the skb control buffer for building up the packet */
	BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
			sizeof_field(struct sk_buff, cb));
	packet = (struct hv_netvsc_packet *)skb->cb;

	packet->q_idx = skb_get_queue_mapping(skb);

	packet->total_data_buflen = skb->len;
	packet->total_bytes = skb->len;
	packet->total_packets = 1;

	rndis_msg = (struct rndis_message *)skb->head;

	/* Add the rndis header */
	rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET;
	rndis_msg->msg_len = packet->total_data_buflen;

	rndis_msg->msg.pkt = (struct rndis_packet) {
		.data_offset = sizeof(struct rndis_packet),
		.data_len = packet->total_data_buflen,
		.per_pkt_info_offset = sizeof(struct rndis_packet),
	};

	rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet);

	hash = skb_get_hash_raw(skb);
	if (hash != 0 && net->real_num_tx_queues > 1) {
		u32 *hash_info;

		rndis_msg_size += NDIS_HASH_PPI_SIZE;
		hash_info = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE,
					  NBL_HASH_VALUE);
		*hash_info = hash;
	}

	if (skb_vlan_tag_present(skb)) {
		struct ndis_pkt_8021q_info *vlan;

		rndis_msg_size += NDIS_VLAN_PPI_SIZE;
		vlan = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
				     IEEE_8021Q_INFO);

		vlan->value = 0;
		vlan->vlanid = skb_vlan_tag_get_id(skb);
		vlan->cfi = skb_vlan_tag_get_cfi(skb);
		vlan->pri = skb_vlan_tag_get_prio(skb);
	}

	if (skb_is_gso(skb)) {
		struct ndis_tcp_lso_info *lso_info;

		rndis_msg_size += NDIS_LSO_PPI_SIZE;
		lso_info = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
					 TCP_LARGESEND_PKTINFO);

		lso_info->value = 0;
		lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
		if (skb->protocol == htons(ETH_P_IP)) {
			lso_info->lso_v2_transmit.ip_version =
				NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
			ip_hdr(skb)->tot_len = 0;
			ip_hdr(skb)->check = 0;
			tcp_hdr(skb)->check =
				~csum_tcpudp_magic(ip_hdr(skb)->saddr,
						   ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
		} else {
			lso_info->lso_v2_transmit.ip_version =
				NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
			tcp_v6_gso_csum_prep(skb);
		}
		lso_info->lso_v2_transmit.tcp_header_offset = skb_transport_offset(skb);
		lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (net_checksum_info(skb) & net_device_ctx->tx_checksum_mask) {
			struct ndis_tcp_ip_checksum_info *csum_info;

			rndis_msg_size += NDIS_CSUM_PPI_SIZE;
			csum_info = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
						  TCPIP_CHKSUM_PKTINFO);

			csum_info->value = 0;
			csum_info->transmit.tcp_header_offset = skb_transport_offset(skb);

			if (skb->protocol == htons(ETH_P_IP)) {
				csum_info->transmit.is_ipv4 = 1;

				if (ip_hdr(skb)->protocol == IPPROTO_TCP)
					csum_info->transmit.tcp_checksum = 1;
				else
					csum_info->transmit.udp_checksum = 1;
			} else {
				csum_info->transmit.is_ipv6 = 1;

				if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
					csum_info->transmit.tcp_checksum = 1;
				else
					csum_info->transmit.udp_checksum = 1;
			}
		} else {
			/* Can't do offload of this type of checksum */
			if (skb_checksum_help(skb))
				goto drop;
		}
	}

	/* Start filling in the page buffers with the rndis hdr */
	rndis_msg->msg_len += rndis_msg_size;
	packet->total_data_buflen = rndis_msg->msg_len;
	packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
					       skb, packet, pb);

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	ret = netvsc_send(net, packet, rndis_msg, pb, skb, xdp_tx);
	if (likely(ret == 0))
		return NETDEV_TX_OK;

	if (ret == -EAGAIN) {
		++net_device_ctx->eth_stats.tx_busy;
		return NETDEV_TX_BUSY;
	}

	if (ret == -ENOSPC)
		++net_device_ctx->eth_stats.tx_no_space;

drop:
	dev_kfree_skb_any(skb);
	net->stats.tx_dropped++;

	return NETDEV_TX_OK;

no_memory:
	++net_device_ctx->eth_stats.tx_no_memory;
	goto drop;
}
static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *ndev)
{
	return netvsc_xmit(skb, ndev, false);
}
/*
 * netvsc_linkstatus_callback - Link up/down notification
 */
void netvsc_linkstatus_callback(struct net_device *net,
				struct rndis_message *resp)
{
	struct rndis_indicate_status *indicate = &resp->msg.indicate_status;
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct netvsc_reconfig *event;
	unsigned long flags;

	/* Update the physical link speed when changing to another vSwitch */
	if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
		u32 speed;

		speed = *(u32 *)((void *)indicate
				 + indicate->status_buf_offset) / 10000;
		ndev_ctx->speed = speed;
		return;
	}

	/* Handle these link change statuses below */
	if (indicate->status != RNDIS_STATUS_NETWORK_CHANGE &&
	    indicate->status != RNDIS_STATUS_MEDIA_CONNECT &&
	    indicate->status != RNDIS_STATUS_MEDIA_DISCONNECT)
		return;

	if (net->reg_state != NETREG_REGISTERED)
		return;

	event = kzalloc(sizeof(*event), GFP_ATOMIC);
	if (!event)
		return;
	event->event = indicate->status;

	spin_lock_irqsave(&ndev_ctx->lock, flags);
	list_add_tail(&event->list, &ndev_ctx->reconfig_events);
	spin_unlock_irqrestore(&ndev_ctx->lock, flags);

	schedule_delayed_work(&ndev_ctx->dwork, 0);
}
static void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
{
	int rc;

	skb->queue_mapping = skb_get_rx_queue(skb);
	__skb_push(skb, ETH_HLEN);

	rc = netvsc_xmit(skb, ndev, true);

	if (dev_xmit_complete(rc))
		return;

	dev_kfree_skb_any(skb);
	ndev->stats.tx_dropped++;
}
static void netvsc_comp_ipcsum(struct sk_buff *skb)
{
	struct iphdr *iph = (struct iphdr *)skb->data;

	iph->check = 0;
	iph->check = ip_fast_csum(iph, iph->ihl);
}
static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
					     struct netvsc_channel *nvchan,
					     struct xdp_buff *xdp)
{
	struct napi_struct *napi = &nvchan->napi;
	const struct ndis_pkt_8021q_info *vlan = nvchan->rsc.vlan;
	const struct ndis_tcp_ip_checksum_info *csum_info =
		nvchan->rsc.csum_info;
	const u32 *hash_info = nvchan->rsc.hash_info;
	struct sk_buff *skb;
	void *xbuf = xdp->data_hard_start;
	int i;

	if (xbuf) {
		unsigned int hdroom = xdp->data - xdp->data_hard_start;
		unsigned int xlen = xdp->data_end - xdp->data;
		unsigned int frag_size = netvsc_xdp_fraglen(hdroom + xlen);

		skb = build_skb(xbuf, frag_size);

		if (!skb) {
			__free_page(virt_to_page(xbuf));
			return NULL;
		}

		skb_reserve(skb, hdroom);
		skb_put(skb, xlen);
		skb->dev = napi->dev;
	} else {
		skb = napi_alloc_skb(napi, nvchan->rsc.pktlen);

		if (!skb)
			return NULL;

		/* Copy to skb. This copy is needed here since the memory
		 * pointed by hv_netvsc_packet cannot be deallocated.
		 */
		for (i = 0; i < nvchan->rsc.cnt; i++)
			skb_put_data(skb, nvchan->rsc.data[i],
				     nvchan->rsc.len[i]);
	}

	skb->protocol = eth_type_trans(skb, net);

	/* skb is already created with CHECKSUM_NONE */
	skb_checksum_none_assert(skb);

	/* Incoming packets may have IP header checksum verified by the host.
	 * They may not have IP header checksum computed after coalescing.
	 * We compute it here if the flags are set, because on Linux, the IP
	 * checksum is always checked.
	 */
	if (csum_info && csum_info->receive.ip_checksum_value_invalid &&
	    csum_info->receive.ip_checksum_succeeded &&
	    skb->protocol == htons(ETH_P_IP))
		netvsc_comp_ipcsum(skb);

	/* Do L4 checksum offload if enabled and present. */
	if (csum_info && (net->features & NETIF_F_RXCSUM)) {
		if (csum_info->receive.tcp_checksum_succeeded ||
		    csum_info->receive.udp_checksum_succeeded)
			skb->ip_summed = CHECKSUM_UNNECESSARY;
	}

	if (hash_info && (net->features & NETIF_F_RXHASH))
		skb_set_hash(skb, *hash_info, PKT_HASH_TYPE_L4);

	if (vlan) {
		u16 vlan_tci = vlan->vlanid | (vlan->pri << VLAN_PRIO_SHIFT) |
			(vlan->cfi ? VLAN_CFI_MASK : 0);

		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
				       vlan_tci);
	}

	return skb;
}
/*
 * netvsc_recv_callback -  Callback when we receive a packet from the
 * "wire" on the specified device.
 */
int netvsc_recv_callback(struct net_device *net,
			 struct netvsc_device *net_device,
			 struct netvsc_channel *nvchan)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct vmbus_channel *channel = nvchan->channel;
	u16 q_idx = channel->offermsg.offer.sub_channel_index;
	struct sk_buff *skb;
	struct netvsc_stats *rx_stats = &nvchan->rx_stats;
	struct xdp_buff xdp;
	u32 act;

	if (net->reg_state != NETREG_REGISTERED)
		return NVSP_STAT_FAIL;

	act = netvsc_run_xdp(net, nvchan, &xdp);

	if (act != XDP_PASS && act != XDP_TX) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->xdp_drop++;
		u64_stats_update_end(&rx_stats->syncp);

		return NVSP_STAT_SUCCESS; /* consumed by XDP */
	}

	/* Allocate a skb - TODO direct I/O to pages? */
	skb = netvsc_alloc_recv_skb(net, nvchan, &xdp);

	if (unlikely(!skb)) {
		++net_device_ctx->eth_stats.rx_no_memory;
		return NVSP_STAT_FAIL;
	}

	skb_record_rx_queue(skb, q_idx);

	/*
	 * Even if injecting the packet, record the statistics
	 * on the synthetic device because modifying the VF device
	 * statistics will not work correctly.
	 */
	u64_stats_update_begin(&rx_stats->syncp);
	rx_stats->packets++;
	rx_stats->bytes += nvchan->rsc.pktlen;

	if (skb->pkt_type == PACKET_BROADCAST)
		++rx_stats->broadcast;
	else if (skb->pkt_type == PACKET_MULTICAST)
		++rx_stats->multicast;
	u64_stats_update_end(&rx_stats->syncp);

	if (act == XDP_TX) {
		netvsc_xdp_xmit(skb, net);
		return NVSP_STAT_SUCCESS;
	}

	napi_gro_receive(&nvchan->napi, skb);
	return NVSP_STAT_SUCCESS;
}
static void netvsc_get_drvinfo(struct net_device *net,
			       struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
}
static void netvsc_get_channels(struct net_device *net,
				struct ethtool_channels *channel)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);

	if (nvdev) {
		channel->max_combined	= nvdev->max_chn;
		channel->combined_count = nvdev->num_chn;
	}
}
/* Alloc struct netvsc_device_info, and initialize it from either existing
 * struct netvsc_device, or from default values.
 */
static
struct netvsc_device_info *netvsc_devinfo_get(struct netvsc_device *nvdev)
{
	struct netvsc_device_info *dev_info;
	struct bpf_prog *prog;

	dev_info = kzalloc(sizeof(*dev_info), GFP_ATOMIC);

	if (!dev_info)
		return NULL;

	if (nvdev) {
		ASSERT_RTNL();

		dev_info->num_chn = nvdev->num_chn;
		dev_info->send_sections = nvdev->send_section_cnt;
		dev_info->send_section_size = nvdev->send_section_size;
		dev_info->recv_sections = nvdev->recv_section_cnt;
		dev_info->recv_section_size = nvdev->recv_section_size;

		memcpy(dev_info->rss_key, nvdev->extension->rss_key,
		       NETVSC_HASH_KEYLEN);

		prog = netvsc_xdp_get(nvdev);
		if (prog) {
			bpf_prog_inc(prog);
			dev_info->bprog = prog;
		}
	} else {
		dev_info->num_chn = VRSS_CHANNEL_DEFAULT;
		dev_info->send_sections = NETVSC_DEFAULT_TX;
		dev_info->send_section_size = NETVSC_SEND_SECTION_SIZE;
		dev_info->recv_sections = NETVSC_DEFAULT_RX;
		dev_info->recv_section_size = NETVSC_RECV_SECTION_SIZE;
	}

	return dev_info;
}
/* Free struct netvsc_device_info */
static void netvsc_devinfo_put(struct netvsc_device_info *dev_info)
{
	if (dev_info->bprog) {
		ASSERT_RTNL();
		bpf_prog_put(dev_info->bprog);
	}

	kfree(dev_info);
}
static int netvsc_detach(struct net_device *ndev,
			 struct netvsc_device *nvdev)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct hv_device *hdev = ndev_ctx->device_ctx;
	int ret;

	/* Don't try continuing to try and setup sub channels */
	if (cancel_work_sync(&nvdev->subchan_work))
		nvdev->num_chn = 1;

	netvsc_xdp_set(ndev, NULL, NULL, nvdev);

	/* If device was up (receiving) then shutdown */
	if (netif_running(ndev)) {
		netvsc_tx_disable(nvdev, ndev);

		ret = rndis_filter_close(nvdev);
		if (ret) {
			netdev_err(ndev,
				   "unable to close device (ret %d).\n", ret);
			return ret;
		}

		ret = netvsc_wait_until_empty(nvdev);
		if (ret) {
			netdev_err(ndev,
				   "Ring buffer not empty after closing rndis\n");
			return ret;
		}
	}

	netif_device_detach(ndev);

	rndis_filter_device_remove(hdev, nvdev);

	return 0;
}
static int netvsc_attach(struct net_device *ndev,
			 struct netvsc_device_info *dev_info)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct hv_device *hdev = ndev_ctx->device_ctx;
	struct netvsc_device *nvdev;
	struct rndis_device *rdev;
	struct bpf_prog *prog;
	int ret = 0;

	nvdev = rndis_filter_device_add(hdev, dev_info);
	if (IS_ERR(nvdev))
		return PTR_ERR(nvdev);

	if (nvdev->num_chn > 1) {
		ret = rndis_set_subchannel(ndev, nvdev, dev_info);

		/* if unavailable, just proceed with one queue */
		if (ret) {
			nvdev->max_chn = 1;
			nvdev->num_chn = 1;
		}
	}

	prog = dev_info->bprog;
	if (prog) {
		bpf_prog_inc(prog);
		ret = netvsc_xdp_set(ndev, prog, NULL, nvdev);
		if (ret) {
			bpf_prog_put(prog);
			goto err1;
		}
	}

	/* In any case device is now ready */
	nvdev->tx_disable = false;
	netif_device_attach(ndev);

	/* Note: enable and attach happen when sub-channels setup */
	netif_carrier_off(ndev);

	if (netif_running(ndev)) {
		ret = rndis_filter_open(nvdev);
		if (ret)
			goto err2;

		rdev = nvdev->extension;
		if (!rdev->link_state)
			netif_carrier_on(ndev);
	}

	return 0;

err2:
	netif_device_detach(ndev);

err1:
	rndis_filter_device_remove(hdev, nvdev);

	return ret;
}
static int netvsc_set_channels(struct net_device *net,
			       struct ethtool_channels *channels)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
	unsigned int orig, count = channels->combined_count;
	struct netvsc_device_info *device_info;
	int ret;

	/* We do not support separate count for rx, tx, or other */
	if (count == 0 ||
	    channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5)
		return -EINVAL;

	if (count > nvdev->max_chn)
		return -EINVAL;

	orig = nvdev->num_chn;

	device_info = netvsc_devinfo_get(nvdev);

	if (!device_info)
		return -ENOMEM;

	device_info->num_chn = count;

	ret = netvsc_detach(net, nvdev);
	if (ret)
		goto out;

	ret = netvsc_attach(net, device_info);
	if (ret) {
		device_info->num_chn = orig;
		if (netvsc_attach(net, device_info))
			netdev_err(net, "restoring channel setting failed\n");
	}

out:
	netvsc_devinfo_put(device_info);
	return ret;
}
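/* A hypothetical usage sketch: the channel count configured above is the
 * combined queue count reported and set through ethtool, e.g.
 *
 *	ethtool -l eth0			# query max/current combined channels
 *	ethtool -L eth0 combined 4	# detach and re-attach with 4 channels
 *
 * (interface name is illustrative).
 */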
static void netvsc_init_settings(struct net_device *dev)
{
	struct net_device_context *ndc = netdev_priv(dev);

	ndc->l4_hash = HV_DEFAULT_L4HASH;

	ndc->speed = SPEED_UNKNOWN;
	ndc->duplex = DUPLEX_FULL;

	dev->features = NETIF_F_LRO;
}
static int netvsc_get_link_ksettings(struct net_device *dev,
				     struct ethtool_link_ksettings *cmd)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct net_device *vf_netdev;

	vf_netdev = rtnl_dereference(ndc->vf_netdev);
	if (vf_netdev)
		return __ethtool_get_link_ksettings(vf_netdev, cmd);

	cmd->base.speed = ndc->speed;
	cmd->base.duplex = ndc->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}
static int netvsc_set_link_ksettings(struct net_device *dev,
				     const struct ethtool_link_ksettings *cmd)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);

	if (vf_netdev) {
		if (!vf_netdev->ethtool_ops->set_link_ksettings)
			return -EOPNOTSUPP;

		return vf_netdev->ethtool_ops->set_link_ksettings(vf_netdev,
								  cmd);
	}

	return ethtool_virtdev_set_link_ksettings(dev, cmd,
						  &ndc->speed, &ndc->duplex);
}
static int netvsc_change_mtu(struct net_device *ndev, int mtu)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
	int orig_mtu = ndev->mtu;
	struct netvsc_device_info *device_info;
	int ret = 0;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	device_info = netvsc_devinfo_get(nvdev);

	if (!device_info)
		return -ENOMEM;

	/* Change MTU of underlying VF netdev first. */
	if (vf_netdev) {
		ret = dev_set_mtu(vf_netdev, mtu);
		if (ret)
			goto out;
	}

	ret = netvsc_detach(ndev, nvdev);
	if (ret)
		goto rollback_vf;

	ndev->mtu = mtu;

	ret = netvsc_attach(ndev, device_info);
	if (!ret)
		goto out;

	/* Attempt rollback to original MTU */
	ndev->mtu = orig_mtu;

	if (netvsc_attach(ndev, device_info))
		netdev_err(ndev, "restoring mtu failed\n");
rollback_vf:
	if (vf_netdev)
		dev_set_mtu(vf_netdev, orig_mtu);

out:
	netvsc_devinfo_put(device_info);
	return ret;
}
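/* Hypothetical usage sketch: an MTU change such as
 *
 *	ip link set dev eth0 mtu 1400
 *
 * lands here; the VF MTU is updated first, then the synthetic device is
 * detached and re-attached with the new MTU, rolling back on failure.
 */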
static void netvsc_get_vf_stats(struct net_device *net,
				struct netvsc_vf_pcpu_stats *tot)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	int i;

	memset(tot, 0, sizeof(*tot));

	for_each_possible_cpu(i) {
		const struct netvsc_vf_pcpu_stats *stats
			= per_cpu_ptr(ndev_ctx->vf_stats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			rx_packets = stats->rx_packets;
			tx_packets = stats->tx_packets;
			rx_bytes = stats->rx_bytes;
			tx_bytes = stats->tx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
		tot->tx_dropped += stats->tx_dropped;
	}
}
static void netvsc_get_pcpu_stats(struct net_device *net,
				  struct netvsc_ethtool_pcpu_stats *pcpu_tot)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
	int i;

	/* fetch percpu stats of vf */
	for_each_possible_cpu(i) {
		const struct netvsc_vf_pcpu_stats *stats =
			per_cpu_ptr(ndev_ctx->vf_stats, i);
		struct netvsc_ethtool_pcpu_stats *this_tot = &pcpu_tot[i];
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			this_tot->vf_rx_packets = stats->rx_packets;
			this_tot->vf_tx_packets = stats->tx_packets;
			this_tot->vf_rx_bytes = stats->rx_bytes;
			this_tot->vf_tx_bytes = stats->tx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		this_tot->rx_packets = this_tot->vf_rx_packets;
		this_tot->tx_packets = this_tot->vf_tx_packets;
		this_tot->rx_bytes   = this_tot->vf_rx_bytes;
		this_tot->tx_bytes   = this_tot->vf_tx_bytes;
	}

	/* fetch percpu stats of netvsc */
	for (i = 0; i < nvdev->num_chn; i++) {
		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
		const struct netvsc_stats *stats;
		struct netvsc_ethtool_pcpu_stats *this_tot =
			&pcpu_tot[nvchan->channel->target_cpu];
		u64 packets, bytes;
		unsigned int start;

		stats = &nvchan->tx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		this_tot->tx_bytes	+= bytes;
		this_tot->tx_packets	+= packets;

		stats = &nvchan->rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		this_tot->rx_bytes	+= bytes;
		this_tot->rx_packets	+= packets;
	}
}
static void netvsc_get_stats64(struct net_device *net,
			       struct rtnl_link_stats64 *t)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct netvsc_device *nvdev;
	struct netvsc_vf_pcpu_stats vf_tot;
	int i;

	rcu_read_lock();

	nvdev = rcu_dereference(ndev_ctx->nvdev);
	if (!nvdev)
		goto out;

	netdev_stats_to_stats64(t, &net->stats);

	netvsc_get_vf_stats(net, &vf_tot);
	t->rx_packets += vf_tot.rx_packets;
	t->tx_packets += vf_tot.tx_packets;
	t->rx_bytes   += vf_tot.rx_bytes;
	t->tx_bytes   += vf_tot.tx_bytes;
	t->tx_dropped += vf_tot.tx_dropped;

	for (i = 0; i < nvdev->num_chn; i++) {
		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
		const struct netvsc_stats *stats;
		u64 packets, bytes, multicast;
		unsigned int start;

		stats = &nvchan->tx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		t->tx_bytes	+= bytes;
		t->tx_packets	+= packets;

		stats = &nvchan->rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
			multicast = stats->multicast + stats->broadcast;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		t->rx_bytes	+= bytes;
		t->rx_packets	+= packets;
		t->multicast	+= multicast;
	}
out:
	rcu_read_unlock();
}
static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
{
	struct net_device_context *ndc = netdev_priv(ndev);
	struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	struct sockaddr *addr = p;
	int err;

	err = eth_prepare_mac_addr_change(ndev, p);
	if (err)
		return err;

	if (!nvdev)
		return -ENODEV;

	if (vf_netdev) {
		err = dev_set_mac_address(vf_netdev, addr, NULL);
		if (err)
			return err;
	}

	err = rndis_filter_set_device_mac(nvdev, addr->sa_data);
	if (!err) {
		eth_commit_mac_addr_change(ndev, p);
	} else if (vf_netdev) {
		/* rollback change on VF */
		memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN);
		dev_set_mac_address(vf_netdev, addr, NULL);
	}

	return err;
}
static const struct {
	char name[ETH_GSTRING_LEN];
	u16 offset;
} netvsc_stats[] = {
	{ "tx_scattered", offsetof(struct netvsc_ethtool_stats, tx_scattered) },
	{ "tx_no_memory", offsetof(struct netvsc_ethtool_stats, tx_no_memory) },
	{ "tx_no_space",  offsetof(struct netvsc_ethtool_stats, tx_no_space) },
	{ "tx_too_big",	  offsetof(struct netvsc_ethtool_stats, tx_too_big) },
	{ "tx_busy",	  offsetof(struct netvsc_ethtool_stats, tx_busy) },
	{ "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) },
	{ "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) },
	{ "rx_no_memory", offsetof(struct netvsc_ethtool_stats, rx_no_memory) },
	{ "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) },
	{ "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) },
}, pcpu_stats[] = {
	{ "cpu%u_rx_packets",
		offsetof(struct netvsc_ethtool_pcpu_stats, rx_packets) },
	{ "cpu%u_rx_bytes",
		offsetof(struct netvsc_ethtool_pcpu_stats, rx_bytes) },
	{ "cpu%u_tx_packets",
		offsetof(struct netvsc_ethtool_pcpu_stats, tx_packets) },
	{ "cpu%u_tx_bytes",
		offsetof(struct netvsc_ethtool_pcpu_stats, tx_bytes) },
	{ "cpu%u_vf_rx_packets",
		offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_packets) },
	{ "cpu%u_vf_rx_bytes",
		offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_bytes) },
	{ "cpu%u_vf_tx_packets",
		offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_packets) },
	{ "cpu%u_vf_tx_bytes",
		offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_bytes) },
}, vf_stats[] = {
	{ "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
	{ "vf_rx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
	{ "vf_tx_packets", offsetof(struct netvsc_vf_pcpu_stats, tx_packets) },
	{ "vf_tx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, tx_bytes) },
	{ "vf_tx_dropped", offsetof(struct netvsc_vf_pcpu_stats, tx_dropped) },
};

#define NETVSC_GLOBAL_STATS_LEN	ARRAY_SIZE(netvsc_stats)
#define NETVSC_VF_STATS_LEN	ARRAY_SIZE(vf_stats)

/* statistics per cpu (rx/tx packets/bytes) */
#define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats))

/* 5 statistics per queue (rx/tx packets/bytes, rx xdp_drop) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 5)
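/* These names surface verbatim in `ethtool -S <dev>` output (e.g.
 * "tx_scattered", "cpu0_rx_packets", "rx_queue_0_xdp_drop"); the macros
 * above size the string and value arrays handed back to ethtool in
 * netvsc_get_sset_count()/netvsc_get_strings()/netvsc_get_ethtool_stats().
 */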
static int netvsc_get_sset_count(struct net_device *dev, int string_set)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);

	if (!nvdev)
		return -ENODEV;

	switch (string_set) {
	case ETH_SS_STATS:
		return NETVSC_GLOBAL_STATS_LEN
			+ NETVSC_VF_STATS_LEN
			+ NETVSC_QUEUE_STATS_LEN(nvdev)
			+ NETVSC_PCPU_STATS_LEN;
	default:
		return -EINVAL;
	}
}
static void netvsc_get_ethtool_stats(struct net_device *dev,
				     struct ethtool_stats *stats, u64 *data)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	const void *nds = &ndc->eth_stats;
	const struct netvsc_stats *qstats;
	struct netvsc_vf_pcpu_stats sum;
	struct netvsc_ethtool_pcpu_stats *pcpu_sum;
	unsigned int start;
	u64 packets, bytes;
	u64 xdp_drop;
	int i, j, cpu;

	if (!nvdev)
		return;

	for (i = 0; i < NETVSC_GLOBAL_STATS_LEN; i++)
		data[i] = *(unsigned long *)(nds + netvsc_stats[i].offset);

	netvsc_get_vf_stats(dev, &sum);
	for (j = 0; j < NETVSC_VF_STATS_LEN; j++)
		data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);

	for (j = 0; j < nvdev->num_chn; j++) {
		qstats = &nvdev->chan_table[j].tx_stats;

		do {
			start = u64_stats_fetch_begin_irq(&qstats->syncp);
			packets = qstats->packets;
			bytes = qstats->bytes;
		} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
		data[i++] = packets;
		data[i++] = bytes;

		qstats = &nvdev->chan_table[j].rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&qstats->syncp);
			packets = qstats->packets;
			bytes = qstats->bytes;
			xdp_drop = qstats->xdp_drop;
		} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
		data[i++] = packets;
		data[i++] = bytes;
		data[i++] = xdp_drop;
	}

	pcpu_sum = kvmalloc_array(num_possible_cpus(),
				  sizeof(struct netvsc_ethtool_pcpu_stats),
				  GFP_KERNEL);
	netvsc_get_pcpu_stats(dev, pcpu_sum);
	for_each_present_cpu(cpu) {
		struct netvsc_ethtool_pcpu_stats *this_sum = &pcpu_sum[cpu];

		for (j = 0; j < ARRAY_SIZE(pcpu_stats); j++)
			data[i++] = *(u64 *)((void *)this_sum
					     + pcpu_stats[j].offset);
	}
	kvfree(pcpu_sum);
}
static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	u8 *p = data;
	int i, cpu;

	if (!nvdev)
		return;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++) {
			memcpy(p, netvsc_stats[i].name, ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}

		for (i = 0; i < ARRAY_SIZE(vf_stats); i++) {
			memcpy(p, vf_stats[i].name, ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}

		for (i = 0; i < nvdev->num_chn; i++) {
			sprintf(p, "tx_queue_%u_packets", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "tx_queue_%u_bytes", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "rx_queue_%u_packets", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "rx_queue_%u_bytes", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "rx_queue_%u_xdp_drop", i);
			p += ETH_GSTRING_LEN;
		}

		for_each_present_cpu(cpu) {
			for (i = 0; i < ARRAY_SIZE(pcpu_stats); i++) {
				sprintf(p, pcpu_stats[i].name, cpu);
				p += ETH_GSTRING_LEN;
			}
		}

		break;
	}
}
static int
netvsc_get_rss_hash_opts(struct net_device_context *ndc,
			 struct ethtool_rxnfc *info)
{
	const u32 l4_flag = RXH_L4_B_0_1 | RXH_L4_B_2_3;

	info->data = RXH_IP_SRC | RXH_IP_DST;

	switch (info->flow_type) {
	case TCP_V4_FLOW:
		if (ndc->l4_hash & HV_TCP4_L4HASH)
			info->data |= l4_flag;

		break;

	case TCP_V6_FLOW:
		if (ndc->l4_hash & HV_TCP6_L4HASH)
			info->data |= l4_flag;

		break;

	case UDP_V4_FLOW:
		if (ndc->l4_hash & HV_UDP4_L4HASH)
			info->data |= l4_flag;

		break;

	case UDP_V6_FLOW:
		if (ndc->l4_hash & HV_UDP6_L4HASH)
			info->data |= l4_flag;

		break;

	case IPV4_FLOW:
	case IPV6_FLOW:
		break;
	default:
		info->data = 0;
		break;
	}

	return 0;
}
static int
netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
		 u32 *rules)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);

	if (!nvdev)
		return -ENODEV;

	switch (info->cmd) {
	case ETHTOOL_GRXRINGS:
		info->data = nvdev->num_chn;
		return 0;

	case ETHTOOL_GRXFH:
		return netvsc_get_rss_hash_opts(ndc, info);
	}
	return -EOPNOTSUPP;
}
static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
				    struct ethtool_rxnfc *info)
{
	if (info->data == (RXH_IP_SRC | RXH_IP_DST |
			   RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
		switch (info->flow_type) {
		case TCP_V4_FLOW:
			ndc->l4_hash |= HV_TCP4_L4HASH;
			break;

		case TCP_V6_FLOW:
			ndc->l4_hash |= HV_TCP6_L4HASH;
			break;

		case UDP_V4_FLOW:
			ndc->l4_hash |= HV_UDP4_L4HASH;
			break;

		case UDP_V6_FLOW:
			ndc->l4_hash |= HV_UDP6_L4HASH;
			break;

		default:
			return -EOPNOTSUPP;
		}

		return 0;
	}

	if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
		switch (info->flow_type) {
		case TCP_V4_FLOW:
			ndc->l4_hash &= ~HV_TCP4_L4HASH;
			break;

		case TCP_V6_FLOW:
			ndc->l4_hash &= ~HV_TCP6_L4HASH;
			break;

		case UDP_V4_FLOW:
			ndc->l4_hash &= ~HV_UDP4_L4HASH;
			break;

		case UDP_V6_FLOW:
			ndc->l4_hash &= ~HV_UDP6_L4HASH;
			break;

		default:
			return -EOPNOTSUPP;
		}

		return 0;
	}

	return -EOPNOTSUPP;
}
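/* Mapping note (a sketch of the userspace view): the RXH_* bit sets above
 * correspond to ethtool's hash-option letters -- RXH_IP_SRC|RXH_IP_DST is
 * "sd", and adding RXH_L4_B_0_1|RXH_L4_B_2_3 makes it "sdfn" -- so a command
 * like `ethtool -N eth0 rx-flow-hash tcp4 sdfn` (interface name illustrative)
 * takes the first branch and enables 4-tuple hashing for that flow type.
 */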
static int
netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info)
{
	struct net_device_context *ndc = netdev_priv(ndev);

	if (info->cmd == ETHTOOL_SRXFH)
		return netvsc_set_rss_hash_opts(ndc, info);

	return -EOPNOTSUPP;
}
static u32 netvsc_get_rxfh_key_size(struct net_device *dev)
{
	return NETVSC_HASH_KEYLEN;
}

static u32 netvsc_rss_indir_size(struct net_device *dev)
{
	return ITAB_NUM;
}
static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
			   u8 *hfunc)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
	struct rndis_device *rndis_dev;
	int i;

	if (!ndev)
		return -ENODEV;

	if (hfunc)
		*hfunc = ETH_RSS_HASH_TOP;	/* Toeplitz */

	rndis_dev = ndev->extension;
	if (indir) {
		for (i = 0; i < ITAB_NUM; i++)
			indir[i] = ndc->rx_table[i];
	}

	if (key)
		memcpy(key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN);

	return 0;
}
static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir,
			   const u8 *key, const u8 hfunc)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
	struct rndis_device *rndis_dev;
	int i;

	if (!ndev)
		return -ENODEV;

	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
		return -EOPNOTSUPP;

	rndis_dev = ndev->extension;
	if (indir) {
		for (i = 0; i < ITAB_NUM; i++)
			if (indir[i] >= ndev->num_chn)
				return -EINVAL;

		for (i = 0; i < ITAB_NUM; i++)
			ndc->rx_table[i] = indir[i];
	}

	if (!key) {
		if (!indir)
			return 0;

		key = rndis_dev->rss_key;
	}

	return rndis_filter_set_rss_param(rndis_dev, key);
}
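/* Hypothetical usage sketch: the Toeplitz key and indirection table handled
 * above are what userspace inspects and sets with
 *
 *	ethtool -x eth0			# show RSS key and indirection table
 *	ethtool -X eth0 equal 4		# spread flows evenly over queues 0-3
 *
 * (interface name illustrative).
 */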
/* Hyper-V RNDIS protocol does not have ring in the HW sense.
 * It does have pre-allocated receive area which is divided into sections.
 */
static void __netvsc_get_ringparam(struct netvsc_device *nvdev,
				   struct ethtool_ringparam *ring)
{
	u32 max_buf_size;

	ring->rx_pending = nvdev->recv_section_cnt;
	ring->tx_pending = nvdev->send_section_cnt;

	if (nvdev->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
		max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY;
	else
		max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE;

	ring->rx_max_pending = max_buf_size / nvdev->recv_section_size;
	ring->tx_max_pending = NETVSC_SEND_BUFFER_SIZE
		/ nvdev->send_section_size;
}
static void netvsc_get_ringparam(struct net_device *ndev,
				 struct ethtool_ringparam *ring)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);

	if (!nvdev)
		return;

	__netvsc_get_ringparam(nvdev, ring);
}
static int netvsc_set_ringparam(struct net_device *ndev,
				struct ethtool_ringparam *ring)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
	struct netvsc_device_info *device_info;
	struct ethtool_ringparam orig;
	u32 new_tx, new_rx;
	int ret = 0;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	memset(&orig, 0, sizeof(orig));
	__netvsc_get_ringparam(nvdev, &orig);

	new_tx = clamp_t(u32, ring->tx_pending,
			 NETVSC_MIN_TX_SECTIONS, orig.tx_max_pending);
	new_rx = clamp_t(u32, ring->rx_pending,
			 NETVSC_MIN_RX_SECTIONS, orig.rx_max_pending);

	if (new_tx == orig.tx_pending &&
	    new_rx == orig.rx_pending)
		return 0;	 /* no change */

	device_info = netvsc_devinfo_get(nvdev);

	if (!device_info)
		return -ENOMEM;

	device_info->send_sections = new_tx;
	device_info->recv_sections = new_rx;

	ret = netvsc_detach(ndev, nvdev);
	if (ret)
		goto out;

	ret = netvsc_attach(ndev, device_info);
	if (ret) {
		device_info->send_sections = orig.tx_pending;
		device_info->recv_sections = orig.rx_pending;

		if (netvsc_attach(ndev, device_info))
			netdev_err(ndev, "restoring ringparam failed");
	}

out:
	netvsc_devinfo_put(device_info);
	return ret;
}
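/* Hypothetical usage sketch: the section counts map onto the classic
 * ring-size knobs, e.g.
 *
 *	ethtool -g eth0			# show current/max "ring" sizes
 *	ethtool -G eth0 rx 2048 tx 1024
 *
 * which triggers the detach/attach cycle above with the clamped values
 * (interface name and sizes illustrative).
 */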
static netdev_features_t netvsc_fix_features(struct net_device *ndev,
					     netdev_features_t features)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);

	if (!nvdev || nvdev->destroy)
		return features;

	if ((features & NETIF_F_LRO) && netvsc_xdp_get(nvdev)) {
		features ^= NETIF_F_LRO;
		netdev_info(ndev, "Skip LRO - unsupported with XDP\n");
	}

	return features;
}
static int netvsc_set_features(struct net_device *ndev,
			       netdev_features_t features)
{
	netdev_features_t change = features ^ ndev->features;
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
	struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
	struct ndis_offload_params offloads;
	int ret = 0;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	if (!(change & NETIF_F_LRO))
		goto syncvf;

	memset(&offloads, 0, sizeof(struct ndis_offload_params));

	if (features & NETIF_F_LRO) {
		offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
		offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
	} else {
		offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
		offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
	}

	ret = rndis_filter_set_offload_params(ndev, nvdev, &offloads);

	if (ret) {
		features ^= NETIF_F_LRO;
		ndev->features = features;
	}

syncvf:
	if (!vf_netdev)
		goto out;

	vf_netdev->wanted_features = features;
	netdev_update_features(vf_netdev);

out:
	return ret;
}
static u32 netvsc_get_msglevel(struct net_device *ndev)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);

	return ndev_ctx->msg_enable;
}

static void netvsc_set_msglevel(struct net_device *ndev, u32 val)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);

	ndev_ctx->msg_enable = val;
}
static const struct ethtool_ops ethtool_ops = {
	.get_drvinfo	= netvsc_get_drvinfo,
	.get_msglevel	= netvsc_get_msglevel,
	.set_msglevel	= netvsc_set_msglevel,
	.get_link	= ethtool_op_get_link,
	.get_ethtool_stats = netvsc_get_ethtool_stats,
	.get_sset_count = netvsc_get_sset_count,
	.get_strings	= netvsc_get_strings,
	.get_channels   = netvsc_get_channels,
	.set_channels   = netvsc_set_channels,
	.get_ts_info	= ethtool_op_get_ts_info,
	.get_rxnfc	= netvsc_get_rxnfc,
	.set_rxnfc	= netvsc_set_rxnfc,
	.get_rxfh_key_size = netvsc_get_rxfh_key_size,
	.get_rxfh_indir_size = netvsc_rss_indir_size,
	.get_rxfh	= netvsc_get_rxfh,
	.set_rxfh	= netvsc_set_rxfh,
	.get_link_ksettings = netvsc_get_link_ksettings,
	.set_link_ksettings = netvsc_set_link_ksettings,
	.get_ringparam	= netvsc_get_ringparam,
	.set_ringparam	= netvsc_set_ringparam,
};
static const struct net_device_ops device_ops = {
	.ndo_open =			netvsc_open,
	.ndo_stop =			netvsc_close,
	.ndo_start_xmit =		netvsc_start_xmit,
	.ndo_change_rx_flags =		netvsc_change_rx_flags,
	.ndo_set_rx_mode =		netvsc_set_rx_mode,
	.ndo_fix_features =		netvsc_fix_features,
	.ndo_set_features =		netvsc_set_features,
	.ndo_change_mtu =		netvsc_change_mtu,
	.ndo_validate_addr =		eth_validate_addr,
	.ndo_set_mac_address =		netvsc_set_mac_addr,
	.ndo_select_queue =		netvsc_select_queue,
	.ndo_get_stats64 =		netvsc_get_stats64,
	.ndo_bpf =			netvsc_bpf,
};
/*
 * Handle link status changes. For RNDIS_STATUS_NETWORK_CHANGE emulate link
 * down/up sequence. In case of RNDIS_STATUS_MEDIA_CONNECT when carrier is
 * present send GARP packet to network peers with netif_notify_peers().
 */
static void netvsc_link_change(struct work_struct *w)
{
	struct net_device_context *ndev_ctx =
		container_of(w, struct net_device_context, dwork.work);
	struct hv_device *device_obj = ndev_ctx->device_ctx;
	struct net_device *net = hv_get_drvdata(device_obj);
	struct netvsc_device *net_device;
	struct rndis_device *rdev;
	struct netvsc_reconfig *event = NULL;
	bool notify = false, reschedule = false;
	unsigned long flags, next_reconfig, delay;

	/* if changes are happening, comeback later */
	if (!rtnl_trylock()) {
		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
		return;
	}

	net_device = rtnl_dereference(ndev_ctx->nvdev);
	if (!net_device)
		goto out_unlock;

	rdev = net_device->extension;

	next_reconfig = ndev_ctx->last_reconfig + LINKCHANGE_INT;
	if (time_is_after_jiffies(next_reconfig)) {
		/* link_watch only sends one notification with current state
		 * per second, avoid doing reconfig more frequently. Handle
		 * wrap around.
		 */
		delay = next_reconfig - jiffies;
		delay = delay < LINKCHANGE_INT ? delay : LINKCHANGE_INT;
		schedule_delayed_work(&ndev_ctx->dwork, delay);
		goto out_unlock;
	}
	ndev_ctx->last_reconfig = jiffies;

	spin_lock_irqsave(&ndev_ctx->lock, flags);
	if (!list_empty(&ndev_ctx->reconfig_events)) {
		event = list_first_entry(&ndev_ctx->reconfig_events,
					 struct netvsc_reconfig, list);
		list_del(&event->list);
		reschedule = !list_empty(&ndev_ctx->reconfig_events);
	}
	spin_unlock_irqrestore(&ndev_ctx->lock, flags);

	if (!event)
		goto out_unlock;

	switch (event->event) {
		/* Only the following events are possible due to the check in
		 * netvsc_linkstatus_callback()
		 */
	case RNDIS_STATUS_MEDIA_CONNECT:
		if (rdev->link_state) {
			rdev->link_state = false;
			netif_carrier_on(net);
			netvsc_tx_enable(net_device, net);
		} else {
			notify = true;
		}
		kfree(event);
		break;
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		if (!rdev->link_state) {
			rdev->link_state = true;
			netif_carrier_off(net);
			netvsc_tx_disable(net_device, net);
		}
		kfree(event);
		break;
	case RNDIS_STATUS_NETWORK_CHANGE:
		/* Only makes sense if carrier is present */
		if (!rdev->link_state) {
			rdev->link_state = true;
			netif_carrier_off(net);
			netvsc_tx_disable(net_device, net);
			event->event = RNDIS_STATUS_MEDIA_CONNECT;
			spin_lock_irqsave(&ndev_ctx->lock, flags);
			list_add(&event->list, &ndev_ctx->reconfig_events);
			spin_unlock_irqrestore(&ndev_ctx->lock, flags);
			reschedule = true;
		}
		break;
	}

	rtnl_unlock();

	if (notify)
		netdev_notify_peers(net);

	/* link_watch only sends one notification with current state per
	 * second, handle next reconfig event in 2 seconds.
	 */
	if (reschedule)
		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);

	return;

out_unlock:
	rtnl_unlock();
}
static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
{
	struct net_device_context *net_device_ctx;
	struct net_device *dev;

	dev = netdev_master_upper_dev_get(vf_netdev);
	if (!dev || dev->netdev_ops != &device_ops)
		return NULL;	/* not a netvsc device */

	net_device_ctx = netdev_priv(dev);
	if (!rtnl_dereference(net_device_ctx->nvdev))
		return NULL;	/* device is removed */

	return dev;
}
/* Called when VF is injecting data into network stack.
 * Change the associated network device from VF to netvsc.
 * note: already called with rcu_read_lock
 */
static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct netvsc_vf_pcpu_stats *pcpu_stats
		 = this_cpu_ptr(ndev_ctx->vf_stats);

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return RX_HANDLER_CONSUMED;

	*pskb = skb;

	skb->dev = ndev;

	u64_stats_update_begin(&pcpu_stats->syncp);
	pcpu_stats->rx_packets++;
	pcpu_stats->rx_bytes += skb->len;
	u64_stats_update_end(&pcpu_stats->syncp);

	return RX_HANDLER_ANOTHER;
}
static int netvsc_vf_join(struct net_device *vf_netdev,
			  struct net_device *ndev)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	int ret;

	ret = netdev_rx_handler_register(vf_netdev,
					 netvsc_vf_handle_frame, ndev);
	if (ret != 0) {
		netdev_err(vf_netdev,
			   "can not register netvsc VF receive handler (err = %d)\n",
			   ret);
		goto rx_handler_failed;
	}

	ret = netdev_master_upper_dev_link(vf_netdev, ndev,
					   NULL, NULL, NULL);
	if (ret != 0) {
		netdev_err(vf_netdev,
			   "can not set master device %s (err = %d)\n",
			   ndev->name, ret);
		goto upper_link_failed;
	}

	/* set slave flag before open to prevent IPv6 addrconf */
	vf_netdev->flags |= IFF_SLAVE;

	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);

	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);

	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
	return 0;

upper_link_failed:
	netdev_rx_handler_unregister(vf_netdev);
rx_handler_failed:
	return ret;
}
static void __netvsc_vf_setup(struct net_device *ndev,
			      struct net_device *vf_netdev)
{
	int ret;

	/* Align MTU of VF with master */
	ret = dev_set_mtu(vf_netdev, ndev->mtu);
	if (ret)
		netdev_warn(vf_netdev,
			    "unable to change mtu to %u\n", ndev->mtu);

	/* set multicast etc flags on VF */
	dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE, NULL);

	/* sync address list from ndev to VF */
	netif_addr_lock_bh(ndev);
	dev_uc_sync(vf_netdev, ndev);
	dev_mc_sync(vf_netdev, ndev);
	netif_addr_unlock_bh(ndev);

	if (netif_running(ndev)) {
		ret = dev_open(vf_netdev, NULL);
		if (ret)
			netdev_warn(vf_netdev,
				    "unable to open: %d\n", ret);
	}
}
/* Setup VF as slave of the synthetic device.
 * Runs in workqueue to avoid recursion in netlink callbacks.
 */
static void netvsc_vf_setup(struct work_struct *w)
{
	struct net_device_context *ndev_ctx
		= container_of(w, struct net_device_context, vf_takeover.work);
	struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx);
	struct net_device *vf_netdev;

	if (!rtnl_trylock()) {
		schedule_delayed_work(&ndev_ctx->vf_takeover, 0);
		return;
	}

	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		__netvsc_vf_setup(ndev, vf_netdev);

	rtnl_unlock();
}
/* Find netvsc by VF serial number.
 * The PCI hyperv controller records the serial number as the slot kobj name.
 */
static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
{
	struct device *parent = vf_netdev->dev.parent;
	struct net_device_context *ndev_ctx;
	struct pci_dev *pdev;
	u32 serial;

	if (!parent || !dev_is_pci(parent))
		return NULL; /* not a PCI device */

	pdev = to_pci_dev(parent);
	if (!pdev->slot) {
		netdev_notice(vf_netdev, "no PCI slot information\n");
		return NULL;
	}

	if (kstrtou32(pci_slot_name(pdev->slot), 10, &serial)) {
		netdev_notice(vf_netdev, "Invalid vf serial:%s\n",
			      pci_slot_name(pdev->slot));
		return NULL;
	}

	list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
		if (!ndev_ctx->vf_alloc)
			continue;

		if (ndev_ctx->vf_serial == serial)
			return hv_get_drvdata(ndev_ctx->device_ctx);
	}

	netdev_notice(vf_netdev,
		      "no netdev found for vf serial:%u\n", serial);
	return NULL;
}
static int netvsc_register_vf(struct net_device *vf_netdev)
{
	struct net_device_context *net_device_ctx;
	struct netvsc_device *netvsc_dev;
	struct bpf_prog *prog;
	struct net_device *ndev;
	int ret;

	if (vf_netdev->addr_len != ETH_ALEN)
		return NOTIFY_DONE;

	ndev = get_netvsc_byslot(vf_netdev);
	if (!ndev)
		return NOTIFY_DONE;

	net_device_ctx = netdev_priv(ndev);
	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
		return NOTIFY_DONE;

	/* if synthetic interface is a different namespace,
	 * then move the VF to that namespace; join will be
	 * done again in that context.
	 */
	if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
		ret = dev_change_net_namespace(vf_netdev,
					       dev_net(ndev), "eth%d");
		if (ret)
			netdev_err(vf_netdev,
				   "could not move to same namespace as %s: %d\n",
				   ndev->name, ret);
		else
			netdev_info(vf_netdev,
				    "VF moved to namespace with: %s\n",
				    ndev->name);
		return NOTIFY_DONE;
	}

	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);

	if (netvsc_vf_join(vf_netdev, ndev) != 0)
		return NOTIFY_DONE;

	dev_hold(vf_netdev);
	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);

	vf_netdev->wanted_features = ndev->features;
	netdev_update_features(vf_netdev);

	prog = netvsc_xdp_get(netvsc_dev);
	netvsc_vf_setxdp(vf_netdev, prog);

	return NOTIFY_OK;
}
/* VF up/down change detected, schedule to change data path */
static int netvsc_vf_changed(struct net_device *vf_netdev)
{
	struct net_device_context *net_device_ctx;
	struct netvsc_device *netvsc_dev;
	struct net_device *ndev;
	bool vf_is_up = netif_running(vf_netdev);

	ndev = get_netvsc_byref(vf_netdev);
	if (!ndev)
		return NOTIFY_DONE;

	net_device_ctx = netdev_priv(ndev);
	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
	if (!netvsc_dev)
		return NOTIFY_DONE;

	netvsc_switch_datapath(ndev, vf_is_up);
	netdev_info(ndev, "Data path switched %s VF: %s\n",
		    vf_is_up ? "to" : "from", vf_netdev->name);

	return NOTIFY_OK;
}

static int netvsc_unregister_vf(struct net_device *vf_netdev)
{
	struct net_device *ndev;
	struct net_device_context *net_device_ctx;

	ndev = get_netvsc_byref(vf_netdev);
	if (!ndev)
		return NOTIFY_DONE;

	net_device_ctx = netdev_priv(ndev);
	cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
	netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);

	netvsc_vf_setxdp(vf_netdev, NULL);

	netdev_rx_handler_unregister(vf_netdev);
	netdev_upper_dev_unlink(vf_netdev, ndev);
	RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
	dev_put(vf_netdev);

	return NOTIFY_OK;
}

static int netvsc_probe(struct hv_device *dev,
			const struct hv_vmbus_device_id *dev_id)
{
	struct net_device *net = NULL;
	struct net_device_context *net_device_ctx;
	struct netvsc_device_info *device_info = NULL;
	struct netvsc_device *nvdev;
	int ret = -ENOMEM;
	net = alloc_etherdev_mq(sizeof(struct net_device_context),
				VRSS_CHANNEL_MAX);
	if (!net)
		goto no_net;

	netif_carrier_off(net);

	netvsc_init_settings(net);
	net_device_ctx = netdev_priv(net);
	net_device_ctx->device_ctx = dev;
	net_device_ctx->msg_enable = netif_msg_init(debug, default_msg);
	if (netif_msg_probe(net_device_ctx))
		netdev_dbg(net, "netvsc msg_enable: %d\n",
			   net_device_ctx->msg_enable);
	hv_set_drvdata(dev, net);

	INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);

	spin_lock_init(&net_device_ctx->lock);
	INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
	INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);
	net_device_ctx->vf_stats
		= netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats);
	if (!net_device_ctx->vf_stats)
		goto no_stats;
	net->netdev_ops = &device_ops;
	net->ethtool_ops = &ethtool_ops;
	SET_NETDEV_DEV(net, &dev->device);

	/* We always need headroom for rndis header */
	net->needed_headroom = RNDIS_AND_PPI_SIZE;
	/* Initialize the number of queues to be 1; we may change it if more
	 * channels are offered later.
	 */
	netif_set_real_num_tx_queues(net, 1);
	netif_set_real_num_rx_queues(net, 1);
	/* Notify the netvsc driver of the new device */
	device_info = netvsc_devinfo_get(NULL);
	if (!device_info) {
		ret = -ENOMEM;
		goto devinfo_failed;
	}
	nvdev = rndis_filter_device_add(dev, device_info);
	if (IS_ERR(nvdev)) {
		ret = PTR_ERR(nvdev);
		netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
		goto rndis_failed;
	}
	memcpy(net->dev_addr, device_info->mac_adr, ETH_ALEN);
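	/* The synthetic NIC uses the MAC address assigned by the host; a
	 * matching VF is expected to show up with the same address.
	 */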
	/* We must get the rtnl lock before scheduling nvdev->subchan_work,
	 * otherwise netvsc_subchan_work() can get the rtnl lock first and
	 * wait for all subchannels to show up, but that may not happen
	 * because netvsc_probe() can't get the rtnl lock, and as a result
	 * vmbus_onoffer() -> ... -> device_add() -> ... ->
	 * __device_attach() can't get the device lock, so none of the
	 * subchannels can be processed -- finally netvsc_subchan_work()
	 * hangs forever.
	 */
	rtnl_lock();

	if (nvdev->num_chn > 1)
		schedule_work(&nvdev->subchan_work);
	/* hw_features computed in rndis_netdev_set_hwcaps() */
	net->features = net->hw_features |
		NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX |
		NETIF_F_HW_VLAN_CTAG_RX;
	net->vlan_features = net->features;
	/* MTU range: 68 - 1500 or 65521 */
	net->min_mtu = NETVSC_MTU_MIN;
	if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
		net->max_mtu = NETVSC_MTU - ETH_HLEN;
	else
		net->max_mtu = ETH_DATA_LEN;

	nvdev->tx_disable = false;
	ret = register_netdevice(net);
	if (ret != 0) {
		pr_err("Unable to register netdev.\n");
		goto register_failed;
	}

	list_add(&net_device_ctx->list, &netvsc_dev_list);
	rtnl_unlock();

	netvsc_devinfo_put(device_info);
	return 0;

register_failed:
	rtnl_unlock();
	rndis_filter_device_remove(dev, nvdev);
rndis_failed:
	netvsc_devinfo_put(device_info);
devinfo_failed:
	free_percpu(net_device_ctx->vf_stats);
no_stats:
	hv_set_drvdata(dev, NULL);
	free_netdev(net);
no_net:
	return ret;
}

static int netvsc_remove(struct hv_device *dev)
{
	struct net_device_context *ndev_ctx;
	struct net_device *vf_netdev, *net;
	struct netvsc_device *nvdev;
	net = hv_get_drvdata(dev);
	if (net == NULL) {
		dev_err(&dev->device, "No net device to remove\n");
		return 0;
	}

	ndev_ctx = netdev_priv(net);

	cancel_delayed_work_sync(&ndev_ctx->dwork);
	rtnl_lock();
	nvdev = rtnl_dereference(ndev_ctx->nvdev);
	if (nvdev) {
		cancel_work_sync(&nvdev->subchan_work);
		netvsc_xdp_set(net, NULL, NULL, nvdev);
	}
	/*
	 * Call to the vsc driver to let it know that the device is being
	 * removed. Also blocks mtu and channel changes.
	 */
	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		netvsc_unregister_vf(vf_netdev);
	if (nvdev)
		rndis_filter_device_remove(dev, nvdev);

	unregister_netdevice(net);
	list_del(&ndev_ctx->list);

	rtnl_unlock();

	hv_set_drvdata(dev, NULL);

	free_percpu(ndev_ctx->vf_stats);
	free_netdev(net);
	return 0;
}

static int netvsc_suspend(struct hv_device *dev)
{
	struct net_device_context *ndev_ctx;
	struct net_device *vf_netdev, *net;
	struct netvsc_device *nvdev;
	int ret;
	net = hv_get_drvdata(dev);

	ndev_ctx = netdev_priv(net);
	cancel_delayed_work_sync(&ndev_ctx->dwork);

	rtnl_lock();
	nvdev = rtnl_dereference(ndev_ctx->nvdev);
	if (nvdev == NULL) {
		ret = -ENODEV;
		goto out;
	}

	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		netvsc_unregister_vf(vf_netdev);
	/* Save the current config info */
	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);

	ret = netvsc_detach(net, nvdev);
out:
	rtnl_unlock();

	return ret;
}
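
/* netvsc_resume() below re-creates the netvsc channels from the
 * device info saved in netvsc_suspend().
 */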
static int netvsc_resume(struct hv_device *dev)
{
	struct net_device *net = hv_get_drvdata(dev);
	struct net_device_context *net_device_ctx;
	struct netvsc_device_info *device_info;
	int ret;
	rtnl_lock();

	net_device_ctx = netdev_priv(net);
	device_info = net_device_ctx->saved_netvsc_dev_info;

	ret = netvsc_attach(net, device_info);

	netvsc_devinfo_put(device_info);
	net_device_ctx->saved_netvsc_dev_info = NULL;

	rtnl_unlock();

	return ret;
}

static const struct hv_vmbus_device_id id_table[] = {
	/* Network guid */
	{ HV_NIC_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, id_table);
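
/* The device table lets the module load automatically when the host
 * offers a VMBus device with the synthetic-network GUID.
 */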
/* The one and only one */
static struct hv_driver netvsc_drv = {
	.name = KBUILD_MODNAME,
	.id_table = id_table,
	.probe = netvsc_probe,
	.remove = netvsc_remove,
	.suspend = netvsc_suspend,
	.resume = netvsc_resume,
	.driver = {
		.probe_type = PROBE_FORCE_SYNCHRONOUS,
	},
};
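
/* PROBE_FORCE_SYNCHRONOUS makes the driver core probe this driver on the
 * registering thread rather than deferring to an async worker.
 */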

/*
 * On Hyper-V, every VF interface is matched with a corresponding
 * synthetic interface. The synthetic interface is presented first
 * to the guest. When the corresponding VF instance is registered,
 * we will take care of switching the data path.
 */
static int netvsc_netdev_event(struct notifier_block *this,
			       unsigned long event, void *ptr)
{
	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
	/* Skip our own events */
	if (event_dev->netdev_ops == &device_ops)
		return NOTIFY_DONE;

	/* Avoid non-Ethernet type devices */
	if (event_dev->type != ARPHRD_ETHER)
		return NOTIFY_DONE;

	/* Avoid Vlan dev with same MAC registering as VF */
	if (is_vlan_dev(event_dev))
		return NOTIFY_DONE;

	/* Avoid Bonding master dev with same MAC registering as VF */
	if ((event_dev->priv_flags & IFF_BONDING) &&
	    (event_dev->flags & IFF_MASTER))
		return NOTIFY_DONE;
	switch (event) {
	case NETDEV_REGISTER:
		return netvsc_register_vf(event_dev);
	case NETDEV_UNREGISTER:
		return netvsc_unregister_vf(event_dev);
	case NETDEV_UP:
	case NETDEV_DOWN:
		return netvsc_vf_changed(event_dev);
	default:
		return NOTIFY_DONE;
	}
}
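
/* Illustrative event sequence for a hot-added VF:
 *
 *   NETDEV_REGISTER   -> netvsc_register_vf()    join as slave
 *   NETDEV_UP         -> netvsc_vf_changed()     switch data path to VF
 *   NETDEV_DOWN       -> netvsc_vf_changed()     switch data path back
 *   NETDEV_UNREGISTER -> netvsc_unregister_vf()  unlink and release
 */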
static struct notifier_block netvsc_netdev_notifier = {
	.notifier_call = netvsc_netdev_event,
};

static void __exit netvsc_drv_exit(void)
{
	unregister_netdevice_notifier(&netvsc_netdev_notifier);
	vmbus_driver_unregister(&netvsc_drv);
}

static int __init netvsc_drv_init(void)
{
	int ret;

	if (ring_size < RING_SIZE_MIN) {
		ring_size = RING_SIZE_MIN;
		pr_info("Increased ring_size to %u (min allowed)\n",
			ring_size);
	}
	netvsc_ring_bytes = ring_size * PAGE_SIZE;

	ret = vmbus_driver_register(&netvsc_drv);
	if (ret)
		return ret;
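	/* Watch for VF netdevs coming and going; see netvsc_netdev_event(). */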
	register_netdevice_notifier(&netvsc_netdev_notifier);
	return 0;
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V network driver");

module_init(netvsc_drv_init);
module_exit(netvsc_drv_exit);