]> git.ipfire.org Git - thirdparty/linux.git/blob - net/mptcp/options.c
Merge tag 'io_uring-5.7-2020-05-22' of git://git.kernel.dk/linux-block
[thirdparty/linux.git] / net / mptcp / options.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3 *
4 * Copyright (c) 2017 - 2019, Intel Corporation.
5 */
6
7 #define pr_fmt(fmt) "MPTCP: " fmt
8
9 #include <linux/kernel.h>
10 #include <net/tcp.h>
11 #include <net/mptcp.h>
12 #include "protocol.h"
13
14 static bool mptcp_cap_flag_sha256(u8 flags)
15 {
16 return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
17 }
18
19 static void mptcp_parse_option(const struct sk_buff *skb,
20 const unsigned char *ptr, int opsize,
21 struct mptcp_options_received *mp_opt)
22 {
23 u8 subtype = *ptr >> 4;
24 int expected_opsize;
25 u8 version;
26 u8 flags;
27
28 switch (subtype) {
29 case MPTCPOPT_MP_CAPABLE:
30 /* strict size checking */
31 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
32 if (skb->len > tcp_hdr(skb)->doff << 2)
33 expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
34 else
35 expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
36 } else {
37 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
38 expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
39 else
40 expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
41 }
42 if (opsize != expected_opsize)
43 break;
44
45 /* try to be gentle vs future versions on the initial syn */
46 version = *ptr++ & MPTCP_VERSION_MASK;
47 if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
48 if (version != MPTCP_SUPPORTED_VERSION)
49 break;
50 } else if (version < MPTCP_SUPPORTED_VERSION) {
51 break;
52 }
53
54 flags = *ptr++;
55 if (!mptcp_cap_flag_sha256(flags) ||
56 (flags & MPTCP_CAP_EXTENSIBILITY))
57 break;
58
59 /* RFC 6824, Section 3.1:
60 * "For the Checksum Required bit (labeled "A"), if either
61 * host requires the use of checksums, checksums MUST be used.
62 * In other words, the only way for checksums not to be used
63 * is if both hosts in their SYNs set A=0."
64 *
65 * Section 3.3.0:
66 * "If a checksum is not present when its use has been
67 * negotiated, the receiver MUST close the subflow with a RST as
68 * it is considered broken."
69 *
70 * We don't implement DSS checksum - fall back to TCP.
71 */
72 if (flags & MPTCP_CAP_CHECKSUM_REQD)
73 break;
74
75 mp_opt->mp_capable = 1;
76 if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
77 mp_opt->sndr_key = get_unaligned_be64(ptr);
78 ptr += 8;
79 }
80 if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
81 mp_opt->rcvr_key = get_unaligned_be64(ptr);
82 ptr += 8;
83 }
84 if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
85 /* Section 3.1.:
86 * "the data parameters in a MP_CAPABLE are semantically
87 * equivalent to those in a DSS option and can be used
88 * interchangeably."
89 */
90 mp_opt->dss = 1;
91 mp_opt->use_map = 1;
92 mp_opt->mpc_map = 1;
93 mp_opt->data_len = get_unaligned_be16(ptr);
94 ptr += 2;
95 }
96 pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
97 version, flags, opsize, mp_opt->sndr_key,
98 mp_opt->rcvr_key, mp_opt->data_len);
99 break;
100
101 case MPTCPOPT_MP_JOIN:
102 mp_opt->mp_join = 1;
103 if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
104 mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
105 mp_opt->join_id = *ptr++;
106 mp_opt->token = get_unaligned_be32(ptr);
107 ptr += 4;
108 mp_opt->nonce = get_unaligned_be32(ptr);
109 ptr += 4;
110 pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
111 mp_opt->backup, mp_opt->join_id,
112 mp_opt->token, mp_opt->nonce);
113 } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
114 mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
115 mp_opt->join_id = *ptr++;
116 mp_opt->thmac = get_unaligned_be64(ptr);
117 ptr += 8;
118 mp_opt->nonce = get_unaligned_be32(ptr);
119 ptr += 4;
120 pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
121 mp_opt->backup, mp_opt->join_id,
122 mp_opt->thmac, mp_opt->nonce);
123 } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
124 ptr += 2;
125 memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
126 pr_debug("MP_JOIN hmac");
127 } else {
128 pr_warn("MP_JOIN bad option size");
129 mp_opt->mp_join = 0;
130 }
131 break;
132
133 case MPTCPOPT_DSS:
134 pr_debug("DSS");
135 ptr++;
136
137 /* we must clear 'mpc_map' be able to detect MP_CAPABLE
138 * map vs DSS map in mptcp_incoming_options(), and reconstruct
139 * map info accordingly
140 */
141 mp_opt->mpc_map = 0;
142 flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
143 mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
144 mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
145 mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
146 mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
147 mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
148
149 pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
150 mp_opt->data_fin, mp_opt->dsn64,
151 mp_opt->use_map, mp_opt->ack64,
152 mp_opt->use_ack);
153
154 expected_opsize = TCPOLEN_MPTCP_DSS_BASE;
155
156 if (mp_opt->use_ack) {
157 if (mp_opt->ack64)
158 expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
159 else
160 expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
161 }
162
163 if (mp_opt->use_map) {
164 if (mp_opt->dsn64)
165 expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
166 else
167 expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
168 }
169
170 /* RFC 6824, Section 3.3:
171 * If a checksum is present, but its use had
172 * not been negotiated in the MP_CAPABLE handshake,
173 * the checksum field MUST be ignored.
174 */
175 if (opsize != expected_opsize &&
176 opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
177 break;
178
179 mp_opt->dss = 1;
180
181 if (mp_opt->use_ack) {
182 if (mp_opt->ack64) {
183 mp_opt->data_ack = get_unaligned_be64(ptr);
184 ptr += 8;
185 } else {
186 mp_opt->data_ack = get_unaligned_be32(ptr);
187 ptr += 4;
188 }
189
190 pr_debug("data_ack=%llu", mp_opt->data_ack);
191 }
192
193 if (mp_opt->use_map) {
194 if (mp_opt->dsn64) {
195 mp_opt->data_seq = get_unaligned_be64(ptr);
196 ptr += 8;
197 } else {
198 mp_opt->data_seq = get_unaligned_be32(ptr);
199 ptr += 4;
200 }
201
202 mp_opt->subflow_seq = get_unaligned_be32(ptr);
203 ptr += 4;
204
205 mp_opt->data_len = get_unaligned_be16(ptr);
206 ptr += 2;
207
208 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
209 mp_opt->data_seq, mp_opt->subflow_seq,
210 mp_opt->data_len);
211 }
212
213 break;
214
215 case MPTCPOPT_ADD_ADDR:
216 mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
217 if (!mp_opt->echo) {
218 if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
219 opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
220 mp_opt->family = MPTCP_ADDR_IPVERSION_4;
221 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
222 else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
223 opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
224 mp_opt->family = MPTCP_ADDR_IPVERSION_6;
225 #endif
226 else
227 break;
228 } else {
229 if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
230 opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
231 mp_opt->family = MPTCP_ADDR_IPVERSION_4;
232 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
233 else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
234 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
235 mp_opt->family = MPTCP_ADDR_IPVERSION_6;
236 #endif
237 else
238 break;
239 }
240
241 mp_opt->add_addr = 1;
242 mp_opt->port = 0;
243 mp_opt->addr_id = *ptr++;
244 pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id);
245 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
246 memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
247 ptr += 4;
248 if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
249 opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
250 mp_opt->port = get_unaligned_be16(ptr);
251 ptr += 2;
252 }
253 }
254 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
255 else {
256 memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16);
257 ptr += 16;
258 if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
259 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
260 mp_opt->port = get_unaligned_be16(ptr);
261 ptr += 2;
262 }
263 }
264 #endif
265 if (!mp_opt->echo) {
266 mp_opt->ahmac = get_unaligned_be64(ptr);
267 ptr += 8;
268 }
269 break;
270
271 case MPTCPOPT_RM_ADDR:
272 if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
273 break;
274
275 mp_opt->rm_addr = 1;
276 mp_opt->rm_id = *ptr++;
277 pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
278 break;
279
280 default:
281 break;
282 }
283 }
284
285 void mptcp_get_options(const struct sk_buff *skb,
286 struct mptcp_options_received *mp_opt)
287 {
288 const struct tcphdr *th = tcp_hdr(skb);
289 const unsigned char *ptr;
290 int length;
291
292 /* initialize option status */
293 mp_opt->mp_capable = 0;
294 mp_opt->mp_join = 0;
295 mp_opt->add_addr = 0;
296 mp_opt->rm_addr = 0;
297 mp_opt->dss = 0;
298
299 length = (th->doff * 4) - sizeof(struct tcphdr);
300 ptr = (const unsigned char *)(th + 1);
301
302 while (length > 0) {
303 int opcode = *ptr++;
304 int opsize;
305
306 switch (opcode) {
307 case TCPOPT_EOL:
308 return;
309 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
310 length--;
311 continue;
312 default:
313 opsize = *ptr++;
314 if (opsize < 2) /* "silly options" */
315 return;
316 if (opsize > length)
317 return; /* don't parse partial options */
318 if (opcode == TCPOPT_MPTCP)
319 mptcp_parse_option(skb, ptr, opsize, mp_opt);
320 ptr += opsize - 2;
321 length -= opsize;
322 }
323 }
324 }
325
326 bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
327 unsigned int *size, struct mptcp_out_options *opts)
328 {
329 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
330
331 /* we will use snd_isn to detect first pkt [re]transmission
332 * in mptcp_established_options_mp()
333 */
334 subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
335 if (subflow->request_mptcp) {
336 pr_debug("local_key=%llu", subflow->local_key);
337 opts->suboptions = OPTION_MPTCP_MPC_SYN;
338 opts->sndr_key = subflow->local_key;
339 *size = TCPOLEN_MPTCP_MPC_SYN;
340 return true;
341 } else if (subflow->request_join) {
342 pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
343 subflow->local_nonce);
344 opts->suboptions = OPTION_MPTCP_MPJ_SYN;
345 opts->join_id = subflow->local_id;
346 opts->token = subflow->remote_token;
347 opts->nonce = subflow->local_nonce;
348 opts->backup = subflow->request_bkup;
349 *size = TCPOLEN_MPTCP_MPJ_SYN;
350 return true;
351 }
352 return false;
353 }
354
355 /* MP_JOIN client subflow must wait for 4th ack before sending any data:
356 * TCP can't schedule delack timer before the subflow is fully established.
357 * MPTCP uses the delack timer to do 3rd ack retransmissions
358 */
359 static void schedule_3rdack_retransmission(struct sock *sk)
360 {
361 struct inet_connection_sock *icsk = inet_csk(sk);
362 struct tcp_sock *tp = tcp_sk(sk);
363 unsigned long timeout;
364
365 /* reschedule with a timeout above RTT, as we must look only for drop */
366 if (tp->srtt_us)
367 timeout = tp->srtt_us << 1;
368 else
369 timeout = TCP_TIMEOUT_INIT;
370
371 WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
372 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
373 icsk->icsk_ack.timeout = timeout;
374 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
375 }
376
377 static void clear_3rdack_retransmission(struct sock *sk)
378 {
379 struct inet_connection_sock *icsk = inet_csk(sk);
380
381 sk_stop_timer(sk, &icsk->icsk_delack_timer);
382 icsk->icsk_ack.timeout = 0;
383 icsk->icsk_ack.ato = 0;
384 icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
385 }
386
387 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
388 unsigned int *size,
389 unsigned int remaining,
390 struct mptcp_out_options *opts)
391 {
392 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
393 struct mptcp_ext *mpext;
394 unsigned int data_len;
395
396 /* When skb is not available, we better over-estimate the emitted
397 * options len. A full DSS option (28 bytes) is longer than
398 * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
399 * tell the caller to defer the estimate to
400 * mptcp_established_options_dss(), which will reserve enough space.
401 */
402 if (!skb)
403 return false;
404
405 /* MPC/MPJ needed only on 3rd ack packet */
406 if (subflow->fully_established ||
407 subflow->snd_isn != TCP_SKB_CB(skb)->seq)
408 return false;
409
410 if (subflow->mp_capable) {
411 mpext = mptcp_get_ext(skb);
412 data_len = mpext ? mpext->data_len : 0;
413
414 /* we will check ext_copy.data_len in mptcp_write_options() to
415 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
416 * TCPOLEN_MPTCP_MPC_ACK
417 */
418 opts->ext_copy.data_len = data_len;
419 opts->suboptions = OPTION_MPTCP_MPC_ACK;
420 opts->sndr_key = subflow->local_key;
421 opts->rcvr_key = subflow->remote_key;
422
423 /* Section 3.1.
424 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
425 * packets that start the first subflow of an MPTCP connection,
426 * as well as the first packet that carries data
427 */
428 if (data_len > 0)
429 *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
430 else
431 *size = TCPOLEN_MPTCP_MPC_ACK;
432
433 pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
434 subflow, subflow->local_key, subflow->remote_key,
435 data_len);
436
437 return true;
438 } else if (subflow->mp_join) {
439 opts->suboptions = OPTION_MPTCP_MPJ_ACK;
440 memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
441 *size = TCPOLEN_MPTCP_MPJ_ACK;
442 pr_debug("subflow=%p", subflow);
443
444 schedule_3rdack_retransmission(sk);
445 return true;
446 }
447 return false;
448 }
449
450 static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
451 struct mptcp_ext *ext)
452 {
453 if (!ext->use_map) {
454 /* RFC6824 requires a DSS mapping with specific values
455 * if DATA_FIN is set but no data payload is mapped
456 */
457 ext->data_fin = 1;
458 ext->use_map = 1;
459 ext->dsn64 = 1;
460 ext->data_seq = subflow->data_fin_tx_seq;
461 ext->subflow_seq = 0;
462 ext->data_len = 1;
463 } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) {
464 /* If there's an existing DSS mapping and it is the
465 * final mapping, DATA_FIN consumes 1 additional byte of
466 * mapping space.
467 */
468 ext->data_fin = 1;
469 ext->data_len++;
470 }
471 }
472
473 static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
474 unsigned int *size,
475 unsigned int remaining,
476 struct mptcp_out_options *opts)
477 {
478 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
479 unsigned int dss_size = 0;
480 struct mptcp_ext *mpext;
481 struct mptcp_sock *msk;
482 unsigned int ack_size;
483 bool ret = false;
484 u8 tcp_fin;
485
486 if (skb) {
487 mpext = mptcp_get_ext(skb);
488 tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
489 } else {
490 mpext = NULL;
491 tcp_fin = 0;
492 }
493
494 if (!skb || (mpext && mpext->use_map) || tcp_fin) {
495 unsigned int map_size;
496
497 map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
498
499 remaining -= map_size;
500 dss_size = map_size;
501 if (mpext)
502 opts->ext_copy = *mpext;
503
504 if (skb && tcp_fin && subflow->data_fin_tx_enable)
505 mptcp_write_data_fin(subflow, &opts->ext_copy);
506 ret = true;
507 }
508
509 /* passive sockets msk will set the 'can_ack' after accept(), even
510 * if the first subflow may have the already the remote key handy
511 */
512 opts->ext_copy.use_ack = 0;
513 msk = mptcp_sk(subflow->conn);
514 if (!READ_ONCE(msk->can_ack)) {
515 *size = ALIGN(dss_size, 4);
516 return ret;
517 }
518
519 ack_size = TCPOLEN_MPTCP_DSS_ACK64;
520
521 /* Add kind/length/subtype/flag overhead if mapping is not populated */
522 if (dss_size == 0)
523 ack_size += TCPOLEN_MPTCP_DSS_BASE;
524
525 dss_size += ack_size;
526
527 opts->ext_copy.data_ack = msk->ack_seq;
528 opts->ext_copy.ack64 = 1;
529 opts->ext_copy.use_ack = 1;
530
531 *size = ALIGN(dss_size, 4);
532 return true;
533 }
534
535 static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
536 struct in_addr *addr)
537 {
538 u8 hmac[MPTCP_ADDR_HMAC_LEN];
539 u8 msg[7];
540
541 msg[0] = addr_id;
542 memcpy(&msg[1], &addr->s_addr, 4);
543 msg[5] = 0;
544 msg[6] = 0;
545
546 mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
547
548 return get_unaligned_be64(hmac);
549 }
550
551 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
552 static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
553 struct in6_addr *addr)
554 {
555 u8 hmac[MPTCP_ADDR_HMAC_LEN];
556 u8 msg[19];
557
558 msg[0] = addr_id;
559 memcpy(&msg[1], &addr->s6_addr, 16);
560 msg[17] = 0;
561 msg[18] = 0;
562
563 mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
564
565 return get_unaligned_be64(hmac);
566 }
567 #endif
568
569 static bool mptcp_established_options_addr(struct sock *sk,
570 unsigned int *size,
571 unsigned int remaining,
572 struct mptcp_out_options *opts)
573 {
574 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
575 struct mptcp_sock *msk = mptcp_sk(subflow->conn);
576 struct mptcp_addr_info saddr;
577 int len;
578
579 if (!mptcp_pm_should_signal(msk) ||
580 !(mptcp_pm_addr_signal(msk, remaining, &saddr)))
581 return false;
582
583 len = mptcp_add_addr_len(saddr.family);
584 if (remaining < len)
585 return false;
586
587 *size = len;
588 opts->addr_id = saddr.id;
589 if (saddr.family == AF_INET) {
590 opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
591 opts->addr = saddr.addr;
592 opts->ahmac = add_addr_generate_hmac(msk->local_key,
593 msk->remote_key,
594 opts->addr_id,
595 &opts->addr);
596 }
597 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
598 else if (saddr.family == AF_INET6) {
599 opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
600 opts->addr6 = saddr.addr6;
601 opts->ahmac = add_addr6_generate_hmac(msk->local_key,
602 msk->remote_key,
603 opts->addr_id,
604 &opts->addr6);
605 }
606 #endif
607 pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac);
608
609 return true;
610 }
611
612 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
613 unsigned int *size, unsigned int remaining,
614 struct mptcp_out_options *opts)
615 {
616 unsigned int opt_size = 0;
617 bool ret = false;
618
619 opts->suboptions = 0;
620
621 if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
622 ret = true;
623 else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
624 opts))
625 ret = true;
626
627 /* we reserved enough space for the above options, and exceeding the
628 * TCP option space would be fatal
629 */
630 if (WARN_ON_ONCE(opt_size > remaining))
631 return false;
632
633 *size += opt_size;
634 remaining -= opt_size;
635 if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) {
636 *size += opt_size;
637 remaining -= opt_size;
638 ret = true;
639 }
640
641 return ret;
642 }
643
644 bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
645 struct mptcp_out_options *opts)
646 {
647 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
648
649 if (subflow_req->mp_capable) {
650 opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
651 opts->sndr_key = subflow_req->local_key;
652 *size = TCPOLEN_MPTCP_MPC_SYNACK;
653 pr_debug("subflow_req=%p, local_key=%llu",
654 subflow_req, subflow_req->local_key);
655 return true;
656 } else if (subflow_req->mp_join) {
657 opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
658 opts->backup = subflow_req->backup;
659 opts->join_id = subflow_req->local_id;
660 opts->thmac = subflow_req->thmac;
661 opts->nonce = subflow_req->local_nonce;
662 pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
663 subflow_req, opts->backup, opts->join_id,
664 opts->thmac, opts->nonce);
665 *size = TCPOLEN_MPTCP_MPJ_SYNACK;
666 return true;
667 }
668 return false;
669 }
670
671 static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
672 struct mptcp_subflow_context *subflow,
673 struct sk_buff *skb,
674 struct mptcp_options_received *mp_opt)
675 {
676 /* here we can process OoO, in-window pkts, only in-sequence 4th ack
677 * will make the subflow fully established
678 */
679 if (likely(subflow->fully_established)) {
680 /* on passive sockets, check for 3rd ack retransmission
681 * note that msk is always set by subflow_syn_recv_sock()
682 * for mp_join subflows
683 */
684 if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
685 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
686 subflow->mp_join && mp_opt->mp_join &&
687 READ_ONCE(msk->pm.server_side))
688 tcp_send_ack(sk);
689 goto fully_established;
690 }
691
692 /* we should process OoO packets before the first subflow is fully
693 * established, but not expected for MP_JOIN subflows
694 */
695 if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)
696 return subflow->mp_capable;
697
698 if (mp_opt->dss && mp_opt->use_ack) {
699 /* subflows are fully established as soon as we get any
700 * additional ack.
701 */
702 subflow->fully_established = 1;
703 goto fully_established;
704 }
705
706 /* If the first established packet does not contain MP_CAPABLE + data
707 * then fallback to TCP
708 */
709 if (!mp_opt->mp_capable) {
710 subflow->mp_capable = 0;
711 tcp_sk(sk)->is_mptcp = 0;
712 return false;
713 }
714
715 if (unlikely(!READ_ONCE(msk->pm.server_side)))
716 pr_warn_once("bogus mpc option on established client sk");
717 subflow->fully_established = 1;
718 subflow->remote_key = mp_opt->sndr_key;
719 subflow->can_ack = 1;
720
721 fully_established:
722 if (likely(subflow->pm_notified))
723 return true;
724
725 subflow->pm_notified = 1;
726 if (subflow->mp_join) {
727 clear_3rdack_retransmission(sk);
728 mptcp_pm_subflow_established(msk, subflow);
729 } else {
730 mptcp_pm_fully_established(msk);
731 }
732 return true;
733 }
734
735 static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
736 {
737 u32 old_ack32, cur_ack32;
738
739 if (use_64bit)
740 return cur_ack;
741
742 old_ack32 = (u32)old_ack;
743 cur_ack32 = (u32)cur_ack;
744 cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32;
745 if (unlikely(before(cur_ack32, old_ack32)))
746 return cur_ack + (1LL << 32);
747 return cur_ack;
748 }
749
750 static void update_una(struct mptcp_sock *msk,
751 struct mptcp_options_received *mp_opt)
752 {
753 u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una);
754 u64 write_seq = READ_ONCE(msk->write_seq);
755
756 /* avoid ack expansion on update conflict, to reduce the risk of
757 * wrongly expanding to a future ack sequence number, which is way
758 * more dangerous than missing an ack
759 */
760 new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
761
762 /* ACK for data not even sent yet? Ignore. */
763 if (after64(new_snd_una, write_seq))
764 new_snd_una = old_snd_una;
765
766 while (after64(new_snd_una, old_snd_una)) {
767 snd_una = old_snd_una;
768 old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una,
769 new_snd_una);
770 if (old_snd_una == snd_una) {
771 mptcp_data_acked((struct sock *)msk);
772 break;
773 }
774 }
775 }
776
777 static bool add_addr_hmac_valid(struct mptcp_sock *msk,
778 struct mptcp_options_received *mp_opt)
779 {
780 u64 hmac = 0;
781
782 if (mp_opt->echo)
783 return true;
784
785 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
786 hmac = add_addr_generate_hmac(msk->remote_key,
787 msk->local_key,
788 mp_opt->addr_id, &mp_opt->addr);
789 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
790 else
791 hmac = add_addr6_generate_hmac(msk->remote_key,
792 msk->local_key,
793 mp_opt->addr_id, &mp_opt->addr6);
794 #endif
795
796 pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
797 msk, (unsigned long long)hmac,
798 (unsigned long long)mp_opt->ahmac);
799
800 return hmac == mp_opt->ahmac;
801 }
802
803 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
804 struct tcp_options_received *opt_rx)
805 {
806 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
807 struct mptcp_sock *msk = mptcp_sk(subflow->conn);
808 struct mptcp_options_received mp_opt;
809 struct mptcp_ext *mpext;
810
811 mptcp_get_options(skb, &mp_opt);
812 if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
813 return;
814
815 if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
816 struct mptcp_addr_info addr;
817
818 addr.port = htons(mp_opt.port);
819 addr.id = mp_opt.addr_id;
820 if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) {
821 addr.family = AF_INET;
822 addr.addr = mp_opt.addr;
823 }
824 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
825 else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) {
826 addr.family = AF_INET6;
827 addr.addr6 = mp_opt.addr6;
828 }
829 #endif
830 if (!mp_opt.echo)
831 mptcp_pm_add_addr_received(msk, &addr);
832 mp_opt.add_addr = 0;
833 }
834
835 if (!mp_opt.dss)
836 return;
837
838 /* we can't wait for recvmsg() to update the ack_seq, otherwise
839 * monodirectional flows will stuck
840 */
841 if (mp_opt.use_ack)
842 update_una(msk, &mp_opt);
843
844 mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
845 if (!mpext)
846 return;
847
848 memset(mpext, 0, sizeof(*mpext));
849
850 if (mp_opt.use_map) {
851 if (mp_opt.mpc_map) {
852 /* this is an MP_CAPABLE carrying MPTCP data
853 * we know this map the first chunk of data
854 */
855 mptcp_crypto_key_sha(subflow->remote_key, NULL,
856 &mpext->data_seq);
857 mpext->data_seq++;
858 mpext->subflow_seq = 1;
859 mpext->dsn64 = 1;
860 mpext->mpc_map = 1;
861 mpext->data_fin = 0;
862 } else {
863 mpext->data_seq = mp_opt.data_seq;
864 mpext->subflow_seq = mp_opt.subflow_seq;
865 mpext->dsn64 = mp_opt.dsn64;
866 mpext->data_fin = mp_opt.data_fin;
867 }
868 mpext->data_len = mp_opt.data_len;
869 mpext->use_map = 1;
870 }
871 }
872
873 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
874 {
875 if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
876 OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
877 u8 len;
878
879 if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
880 len = TCPOLEN_MPTCP_MPC_SYN;
881 else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
882 len = TCPOLEN_MPTCP_MPC_SYNACK;
883 else if (opts->ext_copy.data_len)
884 len = TCPOLEN_MPTCP_MPC_ACK_DATA;
885 else
886 len = TCPOLEN_MPTCP_MPC_ACK;
887
888 *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
889 MPTCP_SUPPORTED_VERSION,
890 MPTCP_CAP_HMAC_SHA256);
891
892 if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
893 opts->suboptions))
894 goto mp_capable_done;
895
896 put_unaligned_be64(opts->sndr_key, ptr);
897 ptr += 2;
898 if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
899 goto mp_capable_done;
900
901 put_unaligned_be64(opts->rcvr_key, ptr);
902 ptr += 2;
903 if (!opts->ext_copy.data_len)
904 goto mp_capable_done;
905
906 put_unaligned_be32(opts->ext_copy.data_len << 16 |
907 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
908 ptr += 1;
909 }
910
911 mp_capable_done:
912 if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
913 if (opts->ahmac)
914 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
915 TCPOLEN_MPTCP_ADD_ADDR, 0,
916 opts->addr_id);
917 else
918 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
919 TCPOLEN_MPTCP_ADD_ADDR_BASE,
920 MPTCP_ADDR_ECHO,
921 opts->addr_id);
922 memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
923 ptr += 1;
924 if (opts->ahmac) {
925 put_unaligned_be64(opts->ahmac, ptr);
926 ptr += 2;
927 }
928 }
929
930 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
931 if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
932 if (opts->ahmac)
933 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
934 TCPOLEN_MPTCP_ADD_ADDR6, 0,
935 opts->addr_id);
936 else
937 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
938 TCPOLEN_MPTCP_ADD_ADDR6_BASE,
939 MPTCP_ADDR_ECHO,
940 opts->addr_id);
941 memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
942 ptr += 4;
943 if (opts->ahmac) {
944 put_unaligned_be64(opts->ahmac, ptr);
945 ptr += 2;
946 }
947 }
948 #endif
949
950 if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
951 *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
952 TCPOLEN_MPTCP_RM_ADDR_BASE,
953 0, opts->rm_id);
954 }
955
956 if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
957 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
958 TCPOLEN_MPTCP_MPJ_SYN,
959 opts->backup, opts->join_id);
960 put_unaligned_be32(opts->token, ptr);
961 ptr += 1;
962 put_unaligned_be32(opts->nonce, ptr);
963 ptr += 1;
964 }
965
966 if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
967 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
968 TCPOLEN_MPTCP_MPJ_SYNACK,
969 opts->backup, opts->join_id);
970 put_unaligned_be64(opts->thmac, ptr);
971 ptr += 2;
972 put_unaligned_be32(opts->nonce, ptr);
973 ptr += 1;
974 }
975
976 if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
977 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
978 TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
979 memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
980 ptr += 5;
981 }
982
983 if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
984 struct mptcp_ext *mpext = &opts->ext_copy;
985 u8 len = TCPOLEN_MPTCP_DSS_BASE;
986 u8 flags = 0;
987
988 if (mpext->use_ack) {
989 len += TCPOLEN_MPTCP_DSS_ACK64;
990 flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
991 }
992
993 if (mpext->use_map) {
994 len += TCPOLEN_MPTCP_DSS_MAP64;
995
996 /* Use only 64-bit mapping flags for now, add
997 * support for optional 32-bit mappings later.
998 */
999 flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
1000 if (mpext->data_fin)
1001 flags |= MPTCP_DSS_DATA_FIN;
1002 }
1003
1004 *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
1005
1006 if (mpext->use_ack) {
1007 put_unaligned_be64(mpext->data_ack, ptr);
1008 ptr += 2;
1009 }
1010
1011 if (mpext->use_map) {
1012 put_unaligned_be64(mpext->data_seq, ptr);
1013 ptr += 2;
1014 put_unaligned_be32(mpext->subflow_seq, ptr);
1015 ptr += 1;
1016 put_unaligned_be32(mpext->data_len << 16 |
1017 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
1018 }
1019 }
1020 }