From fc791b6335152c5278dc4a4991bcb2d329f806f9 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 13 Oct 2016 18:26:56 +0200
Subject: IB/ipoib: move back IB LL address into the hard header

From: Paolo Abeni <pabeni@redhat.com>

commit fc791b6335152c5278dc4a4991bcb2d329f806f9 upstream.

After commit 9207f9d45b0a ("net: preserve IP control block
during GSO segmentation"), the GSO CB and the IPoIB CB conflict.
That destroys the IPoIB address information cached there,
causing a severe performance regression, as described in more
detail here:

http://marc.info/?l=linux-kernel&m=146787279825501&w=2

This change moves the data cached by the IPoIB driver from the
skb control block into the IPoIB hard header, as was done before
commit 936d7de3d736 ("IPoIB: Stop lying about hard_header_len
and use skb->cb to stash LL addresses").
To avoid GRO issues, on packet reception the IPoIB driver now
stashes a dummy pseudo header into the skb, so that received
packets actually have a hard header matching the declared length.
To avoid changing the connected mode maximum MTU, the allocated
head buffer size is increased by the pseudo header length.

After this commit, IPoIB performance is back to its
pre-regression value.

v2 -> v3: rebased
v1 -> v2: avoid changing the max MTU, increasing the head buf size

Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Cc: Vasiliy Tolstov <v.tolstov@selfip.ru>
Cc: Nikolay Borisov <n.borisov.lkml@gmail.com>
Cc: Doug Ledford <dledford@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 drivers/infiniband/ulp/ipoib/ipoib.h           |   20 ++++++---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c        |   15 +++---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c        |   12 ++---
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |   54 +++++++++++++++----------
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |    6 +-
 5 files changed, 64 insertions(+), 43 deletions(-)

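As a quick cross-check of the buffer layout described above (a standalone
sketch, not part of the patch or of the driver), the small C program below
recomputes the constants the diff relies on: IPOIB_HARD_LEN, the
IPOIB_CM_RX_RESERVE value added in ipoib_cm.c, and the 64-byte IP-header
offset mentioned in the new ipoib_ib.c comment. IB_GRH_BYTES and ALIGN()
are mirrored here with their usual kernel values.

/*
 * Standalone sketch (not driver code): recompute the receive-buffer
 * layout arithmetic this patch relies on. Constants mirror the kernel
 * definitions; ALIGN() matches the kernel macro.
 */
#include <stdio.h>

#define IB_GRH_BYTES      40	/* global route header the HCA may deliver */
#define IPOIB_ENCAP_LEN    4	/* real on-the-wire IPoIB header */
#define IPOIB_PSEUDO_LEN  20	/* dummy pseudo header (hwaddr) added by the patch */
#define IPOIB_HARD_LEN    (IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN)
#define ALIGN(x, a)       (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* UD rx path: skb_reserve(skb, sizeof(struct ipoib_pseudo_header)),
	 * then the HCA writes GRH + IPoIB header + payload. */
	int ud_ip_offset = IPOIB_PSEUDO_LEN + IB_GRH_BYTES + IPOIB_ENCAP_LEN;

	/* CM rx path: reserve so the IP header ends up 16-byte aligned. */
	int cm_rx_reserve = ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN;

	printf("IPOIB_HARD_LEN      = %d\n", IPOIB_HARD_LEN);	/* 24 */
	printf("UD IP header offset = %d\n", ud_ip_offset);	/* 64 */
	printf("IPOIB_CM_RX_RESERVE = %d\n", cm_rx_reserve);	/* 28 */
	return 0;
}

It prints 24, 64 and 28: the UD offset matches the "64 bytes aligned"
comment below, and reserving 28 bytes in the CM path leaves the IP header
16-byte aligned after the 4-byte IPoIB header.
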
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -63,6 +63,8 @@ enum ipoib_flush_level {
 
 enum {
 	IPOIB_ENCAP_LEN = 4,
+	IPOIB_PSEUDO_LEN = 20,
+	IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN,
 
 	IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
 	IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */
@@ -131,15 +133,21 @@ struct ipoib_header {
 	u16 reserved;
 };
 
-struct ipoib_cb {
-	struct qdisc_skb_cb qdisc_cb;
-	u8 hwaddr[INFINIBAND_ALEN];
+struct ipoib_pseudo_header {
+	u8 hwaddr[INFINIBAND_ALEN];
 };
 
-static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
+static inline void skb_add_pseudo_hdr(struct sk_buff *skb)
 {
-	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
-	return (struct ipoib_cb *)skb->cb;
+	char *data = skb_push(skb, IPOIB_PSEUDO_LEN);
+
+	/*
+	 * only the ipoib header is present now, make room for a dummy
+	 * pseudo header and set skb field accordingly
+	 */
+	memset(data, 0, IPOIB_PSEUDO_LEN);
+	skb_reset_mac_header(skb);
+	skb_pull(skb, IPOIB_HARD_LEN);
 }
 
 /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level,
 #define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
 #define IPOIB_CM_RX_UPDATE_MASK (0x3)
 
+#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
+
 static struct ib_qp_attr ipoib_cm_err_attr = {
 	.qp_state = IB_QPS_ERR
 };
@@ -147,15 +149,15 @@ static struct sk_buff *ipoib_cm_alloc_rx
 	struct sk_buff *skb;
 	int i;
 
-	skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
+	skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
 	if (unlikely(!skb))
 		return NULL;
 
 	/*
-	 * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
+	 * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the
 	 * IP header to a multiple of 16.
 	 */
-	skb_reserve(skb, 12);
+	skb_reserve(skb, IPOIB_CM_RX_RESERVE);
 
 	mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
 				       DMA_FROM_DEVICE);
@@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_de
 	if (wc->byte_len < IPOIB_CM_COPYBREAK) {
 		int dlen = wc->byte_len;
 
-		small_skb = dev_alloc_skb(dlen + 12);
+		small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
 		if (small_skb) {
-			skb_reserve(small_skb, 12);
+			skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
 			ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
 						   dlen, DMA_FROM_DEVICE);
 			skb_copy_from_linear_data(skb, small_skb->data, dlen);
@@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_de
 
 copied:
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
-	skb_reset_mac_header(skb);
-	skb_pull(skb, IPOIB_ENCAP_LEN);
+	skb_add_pseudo_hdr(skb);
 
 	++dev->stats.rx_packets;
 	dev->stats.rx_bytes += skb->len;
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -130,16 +130,15 @@ static struct sk_buff *ipoib_alloc_rx_sk
 
 	buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 
-	skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
+	skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
 	if (unlikely(!skb))
 		return NULL;
 
 	/*
-	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
-	 * header. So we need 4 more bytes to get to 48 and align the
-	 * IP header to a multiple of 16.
+	 * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is
+	 * 64 bytes aligned
 	 */
-	skb_reserve(skb, 4);
+	skb_reserve(skb, sizeof(struct ipoib_pseudo_header));
 
 	mapping = priv->rx_ring[id].mapping;
 	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
@@ -242,8 +241,7 @@ static void ipoib_ib_handle_rx_wc(struct
 	skb_pull(skb, IB_GRH_BYTES);
 
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
-	skb_reset_mac_header(skb);
-	skb_pull(skb, IPOIB_ENCAP_LEN);
+	skb_add_pseudo_hdr(skb);
 
 	++dev->stats.rx_packets;
 	dev->stats.rx_bytes += skb->len;
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -850,9 +850,12 @@ static void neigh_add_path(struct sk_buf
 			ipoib_neigh_free(neigh);
 			goto err_drop;
 		}
-		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+		if (skb_queue_len(&neigh->queue) <
+		    IPOIB_MAX_PATH_REC_QUEUE) {
+			/* put pseudoheader back on for next time */
+			skb_push(skb, IPOIB_PSEUDO_LEN);
 			__skb_queue_tail(&neigh->queue, skb);
-		else {
+		} else {
 			ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
 				   skb_queue_len(&neigh->queue));
 			goto err_drop;
@@ -889,7 +892,7 @@ err_drop:
 }
 
 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
-			     struct ipoib_cb *cb)
+			     struct ipoib_pseudo_header *phdr)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_path *path;
@@ -897,16 +900,18 @@ static void unicast_arp_send(struct sk_b
 
 	spin_lock_irqsave(&priv->lock, flags);
 
-	path = __path_find(dev, cb->hwaddr + 4);
+	path = __path_find(dev, phdr->hwaddr + 4);
 	if (!path || !path->valid) {
 		int new_path = 0;
 
 		if (!path) {
-			path = path_rec_create(dev, cb->hwaddr + 4);
+			path = path_rec_create(dev, phdr->hwaddr + 4);
 			new_path = 1;
 		}
 		if (path) {
 			if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+				/* put pseudoheader back on for next time */
+				skb_push(skb, IPOIB_PSEUDO_LEN);
 				__skb_queue_tail(&path->queue, skb);
 			} else {
 				++dev->stats.tx_dropped;
@@ -934,10 +939,12 @@ static void unicast_arp_send(struct sk_b
 			  be16_to_cpu(path->pathrec.dlid));
 
 		spin_unlock_irqrestore(&priv->lock, flags);
-		ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
+		ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
 		return;
 	} else if ((path->query || !path_rec_start(dev, path)) &&
 		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		/* put pseudoheader back on for next time */
+		skb_push(skb, IPOIB_PSEUDO_LEN);
 		__skb_queue_tail(&path->queue, skb);
 	} else {
 		++dev->stats.tx_dropped;
@@ -951,13 +958,15 @@ static int ipoib_start_xmit(struct sk_bu
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_neigh *neigh;
-	struct ipoib_cb *cb = ipoib_skb_cb(skb);
+	struct ipoib_pseudo_header *phdr;
 	struct ipoib_header *header;
 	unsigned long flags;
 
+	phdr = (struct ipoib_pseudo_header *) skb->data;
+	skb_pull(skb, sizeof(*phdr));
 	header = (struct ipoib_header *) skb->data;
 
-	if (unlikely(cb->hwaddr[4] == 0xff)) {
+	if (unlikely(phdr->hwaddr[4] == 0xff)) {
 		/* multicast, arrange "if" according to probability */
 		if ((header->proto != htons(ETH_P_IP)) &&
 		    (header->proto != htons(ETH_P_IPV6)) &&
@@ -970,13 +979,13 @@ static int ipoib_start_xmit(struct sk_bu
 			return NETDEV_TX_OK;
 		}
 		/* Add in the P_Key for multicast*/
-		cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
-		cb->hwaddr[9] = priv->pkey & 0xff;
+		phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+		phdr->hwaddr[9] = priv->pkey & 0xff;
 
-		neigh = ipoib_neigh_get(dev, cb->hwaddr);
+		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
 		if (likely(neigh))
 			goto send_using_neigh;
-		ipoib_mcast_send(dev, cb->hwaddr, skb);
+		ipoib_mcast_send(dev, phdr->hwaddr, skb);
 		return NETDEV_TX_OK;
 	}
 
@@ -985,16 +994,16 @@ static int ipoib_start_xmit(struct sk_bu
 	case htons(ETH_P_IP):
 	case htons(ETH_P_IPV6):
 	case htons(ETH_P_TIPC):
-		neigh = ipoib_neigh_get(dev, cb->hwaddr);
+		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
 		if (unlikely(!neigh)) {
-			neigh_add_path(skb, cb->hwaddr, dev);
+			neigh_add_path(skb, phdr->hwaddr, dev);
 			return NETDEV_TX_OK;
 		}
 		break;
 	case htons(ETH_P_ARP):
 	case htons(ETH_P_RARP):
 		/* for unicast ARP and RARP should always perform path find */
-		unicast_arp_send(skb, dev, cb);
+		unicast_arp_send(skb, dev, phdr);
 		return NETDEV_TX_OK;
 	default:
 		/* ethertype not supported by IPoIB */
@@ -1011,11 +1020,13 @@ send_using_neigh:
 			goto unref;
 		}
 	} else if (neigh->ah) {
-		ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
+		ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr));
 		goto unref;
 	}
 
 	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		/* put pseudoheader back on for next time */
+		skb_push(skb, sizeof(*phdr));
 		spin_lock_irqsave(&priv->lock, flags);
 		__skb_queue_tail(&neigh->queue, skb);
 		spin_unlock_irqrestore(&priv->lock, flags);
@@ -1047,8 +1058,8 @@ static int ipoib_hard_header(struct sk_b
 			     unsigned short type,
 			     const void *daddr, const void *saddr, unsigned len)
 {
+	struct ipoib_pseudo_header *phdr;
 	struct ipoib_header *header;
-	struct ipoib_cb *cb = ipoib_skb_cb(skb);
 
 	header = (struct ipoib_header *) skb_push(skb, sizeof *header);
 
@@ -1057,12 +1068,13 @@ static int ipoib_hard_header(struct sk_b
 
 	/*
 	 * we don't rely on dst_entry structure, always stuff the
-	 * destination address into skb->cb so we can figure out where
+	 * destination address into skb hard header so we can figure out where
 	 * to send the packet later.
 	 */
-	memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
+	phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr));
+	memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
 
-	return sizeof *header;
+	return IPOIB_HARD_LEN;
 }
 
 static void ipoib_set_mcast_list(struct net_device *dev)
@@ -1638,7 +1650,7 @@ void ipoib_setup(struct net_device *dev)
 
 	dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
 
-	dev->hard_header_len = IPOIB_ENCAP_LEN;
+	dev->hard_header_len = IPOIB_HARD_LEN;
 	dev->addr_len = INFINIBAND_ALEN;
 	dev->type = ARPHRD_INFINIBAND;
 	dev->tx_queue_len = ipoib_sendq_size * 2;
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -756,9 +756,11 @@ void ipoib_mcast_send(struct net_device
 			__ipoib_mcast_add(dev, mcast);
 			list_add_tail(&mcast->list, &priv->multicast_list);
 		}
-		if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
+		if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) {
+			/* put pseudoheader back on for next time */
+			skb_push(skb, sizeof(struct ipoib_pseudo_header));
 			skb_queue_tail(&mcast->pkt_queue, skb);
-		else {
+		} else {
 			++dev->stats.tx_dropped;
 			dev_kfree_skb_any(skb);
 		}