1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright (c) 2017 - 2019, Intel Corporation.
7 #define pr_fmt(fmt) "MPTCP: " fmt
9 #include <linux/kernel.h>
11 #include <net/mptcp.h>
14 static bool mptcp_cap_flag_sha256(u8 flags
)
16 return (flags
& MPTCP_CAP_FLAG_MASK
) == MPTCP_CAP_HMAC_SHA256
;
19 void mptcp_parse_option(const struct sk_buff
*skb
, const unsigned char *ptr
,
20 int opsize
, struct tcp_options_received
*opt_rx
)
22 struct mptcp_options_received
*mp_opt
= &opt_rx
->mptcp
;
23 u8 subtype
= *ptr
>> 4;
29 case MPTCPOPT_MP_CAPABLE
:
30 /* strict size checking */
31 if (!(TCP_SKB_CB(skb
)->tcp_flags
& TCPHDR_SYN
)) {
32 if (skb
->len
> tcp_hdr(skb
)->doff
<< 2)
33 expected_opsize
= TCPOLEN_MPTCP_MPC_ACK_DATA
;
35 expected_opsize
= TCPOLEN_MPTCP_MPC_ACK
;
37 if (TCP_SKB_CB(skb
)->tcp_flags
& TCPHDR_ACK
)
38 expected_opsize
= TCPOLEN_MPTCP_MPC_SYNACK
;
40 expected_opsize
= TCPOLEN_MPTCP_MPC_SYN
;
42 if (opsize
!= expected_opsize
)
45 /* try to be gentle vs future versions on the initial syn */
46 version
= *ptr
++ & MPTCP_VERSION_MASK
;
47 if (opsize
!= TCPOLEN_MPTCP_MPC_SYN
) {
48 if (version
!= MPTCP_SUPPORTED_VERSION
)
50 } else if (version
< MPTCP_SUPPORTED_VERSION
) {
55 if (!mptcp_cap_flag_sha256(flags
) ||
56 (flags
& MPTCP_CAP_EXTENSIBILITY
))
59 /* RFC 6824, Section 3.1:
60 * "For the Checksum Required bit (labeled "A"), if either
61 * host requires the use of checksums, checksums MUST be used.
62 * In other words, the only way for checksums not to be used
63 * is if both hosts in their SYNs set A=0."
66 * "If a checksum is not present when its use has been
67 * negotiated, the receiver MUST close the subflow with a RST as
68 * it is considered broken."
70 * We don't implement DSS checksum - fall back to TCP.
72 if (flags
& MPTCP_CAP_CHECKSUM_REQD
)
75 mp_opt
->mp_capable
= 1;
76 if (opsize
>= TCPOLEN_MPTCP_MPC_SYNACK
) {
77 mp_opt
->sndr_key
= get_unaligned_be64(ptr
);
80 if (opsize
>= TCPOLEN_MPTCP_MPC_ACK
) {
81 mp_opt
->rcvr_key
= get_unaligned_be64(ptr
);
84 if (opsize
== TCPOLEN_MPTCP_MPC_ACK_DATA
) {
86 * "the data parameters in a MP_CAPABLE are semantically
87 * equivalent to those in a DSS option and can be used
93 mp_opt
->data_len
= get_unaligned_be16(ptr
);
96 pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
97 version
, flags
, opsize
, mp_opt
->sndr_key
,
98 mp_opt
->rcvr_key
, mp_opt
->data_len
);
101 case MPTCPOPT_MP_JOIN
:
103 if (opsize
== TCPOLEN_MPTCP_MPJ_SYN
) {
104 mp_opt
->backup
= *ptr
++ & MPTCPOPT_BACKUP
;
105 mp_opt
->join_id
= *ptr
++;
106 mp_opt
->token
= get_unaligned_be32(ptr
);
108 mp_opt
->nonce
= get_unaligned_be32(ptr
);
110 pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
111 mp_opt
->backup
, mp_opt
->join_id
,
112 mp_opt
->token
, mp_opt
->nonce
);
113 } else if (opsize
== TCPOLEN_MPTCP_MPJ_SYNACK
) {
114 mp_opt
->backup
= *ptr
++ & MPTCPOPT_BACKUP
;
115 mp_opt
->join_id
= *ptr
++;
116 mp_opt
->thmac
= get_unaligned_be64(ptr
);
118 mp_opt
->nonce
= get_unaligned_be32(ptr
);
120 pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
121 mp_opt
->backup
, mp_opt
->join_id
,
122 mp_opt
->thmac
, mp_opt
->nonce
);
123 } else if (opsize
== TCPOLEN_MPTCP_MPJ_ACK
) {
125 memcpy(mp_opt
->hmac
, ptr
, MPTCPOPT_HMAC_LEN
);
126 pr_debug("MP_JOIN hmac");
128 pr_warn("MP_JOIN bad option size");
137 /* we must clear 'mpc_map' be able to detect MP_CAPABLE
138 * map vs DSS map in mptcp_incoming_options(), and reconstruct
139 * map info accordingly
142 flags
= (*ptr
++) & MPTCP_DSS_FLAG_MASK
;
143 mp_opt
->data_fin
= (flags
& MPTCP_DSS_DATA_FIN
) != 0;
144 mp_opt
->dsn64
= (flags
& MPTCP_DSS_DSN64
) != 0;
145 mp_opt
->use_map
= (flags
& MPTCP_DSS_HAS_MAP
) != 0;
146 mp_opt
->ack64
= (flags
& MPTCP_DSS_ACK64
) != 0;
147 mp_opt
->use_ack
= (flags
& MPTCP_DSS_HAS_ACK
);
149 pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
150 mp_opt
->data_fin
, mp_opt
->dsn64
,
151 mp_opt
->use_map
, mp_opt
->ack64
,
154 expected_opsize
= TCPOLEN_MPTCP_DSS_BASE
;
156 if (mp_opt
->use_ack
) {
158 expected_opsize
+= TCPOLEN_MPTCP_DSS_ACK64
;
160 expected_opsize
+= TCPOLEN_MPTCP_DSS_ACK32
;
163 if (mp_opt
->use_map
) {
165 expected_opsize
+= TCPOLEN_MPTCP_DSS_MAP64
;
167 expected_opsize
+= TCPOLEN_MPTCP_DSS_MAP32
;
170 /* RFC 6824, Section 3.3:
171 * If a checksum is present, but its use had
172 * not been negotiated in the MP_CAPABLE handshake,
173 * the checksum field MUST be ignored.
175 if (opsize
!= expected_opsize
&&
176 opsize
!= expected_opsize
+ TCPOLEN_MPTCP_DSS_CHECKSUM
)
181 if (mp_opt
->use_ack
) {
183 mp_opt
->data_ack
= get_unaligned_be64(ptr
);
186 mp_opt
->data_ack
= get_unaligned_be32(ptr
);
190 pr_debug("data_ack=%llu", mp_opt
->data_ack
);
193 if (mp_opt
->use_map
) {
195 mp_opt
->data_seq
= get_unaligned_be64(ptr
);
198 mp_opt
->data_seq
= get_unaligned_be32(ptr
);
202 mp_opt
->subflow_seq
= get_unaligned_be32(ptr
);
205 mp_opt
->data_len
= get_unaligned_be16(ptr
);
208 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
209 mp_opt
->data_seq
, mp_opt
->subflow_seq
,
215 case MPTCPOPT_ADD_ADDR
:
216 mp_opt
->echo
= (*ptr
++) & MPTCP_ADDR_ECHO
;
218 if (opsize
== TCPOLEN_MPTCP_ADD_ADDR
||
219 opsize
== TCPOLEN_MPTCP_ADD_ADDR_PORT
)
220 mp_opt
->family
= MPTCP_ADDR_IPVERSION_4
;
221 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
222 else if (opsize
== TCPOLEN_MPTCP_ADD_ADDR6
||
223 opsize
== TCPOLEN_MPTCP_ADD_ADDR6_PORT
)
224 mp_opt
->family
= MPTCP_ADDR_IPVERSION_6
;
229 if (opsize
== TCPOLEN_MPTCP_ADD_ADDR_BASE
||
230 opsize
== TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT
)
231 mp_opt
->family
= MPTCP_ADDR_IPVERSION_4
;
232 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
233 else if (opsize
== TCPOLEN_MPTCP_ADD_ADDR6_BASE
||
234 opsize
== TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT
)
235 mp_opt
->family
= MPTCP_ADDR_IPVERSION_6
;
241 mp_opt
->add_addr
= 1;
243 mp_opt
->addr_id
= *ptr
++;
244 pr_debug("ADD_ADDR: id=%d", mp_opt
->addr_id
);
245 if (mp_opt
->family
== MPTCP_ADDR_IPVERSION_4
) {
246 memcpy((u8
*)&mp_opt
->addr
.s_addr
, (u8
*)ptr
, 4);
248 if (opsize
== TCPOLEN_MPTCP_ADD_ADDR_PORT
||
249 opsize
== TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT
) {
250 mp_opt
->port
= get_unaligned_be16(ptr
);
254 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
256 memcpy(mp_opt
->addr6
.s6_addr
, (u8
*)ptr
, 16);
258 if (opsize
== TCPOLEN_MPTCP_ADD_ADDR6_PORT
||
259 opsize
== TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT
) {
260 mp_opt
->port
= get_unaligned_be16(ptr
);
266 mp_opt
->ahmac
= get_unaligned_be64(ptr
);
271 case MPTCPOPT_RM_ADDR
:
272 if (opsize
!= TCPOLEN_MPTCP_RM_ADDR_BASE
)
276 mp_opt
->rm_id
= *ptr
++;
277 pr_debug("RM_ADDR: id=%d", mp_opt
->rm_id
);
285 void mptcp_get_options(const struct sk_buff
*skb
,
286 struct tcp_options_received
*opt_rx
)
288 const unsigned char *ptr
;
289 const struct tcphdr
*th
= tcp_hdr(skb
);
290 int length
= (th
->doff
* 4) - sizeof(struct tcphdr
);
292 ptr
= (const unsigned char *)(th
+ 1);
301 case TCPOPT_NOP
: /* Ref: RFC 793 section 3.1 */
306 if (opsize
< 2) /* "silly options" */
309 return; /* don't parse partial options */
310 if (opcode
== TCPOPT_MPTCP
)
311 mptcp_parse_option(skb
, ptr
, opsize
, opt_rx
);
318 bool mptcp_syn_options(struct sock
*sk
, const struct sk_buff
*skb
,
319 unsigned int *size
, struct mptcp_out_options
*opts
)
321 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
323 /* we will use snd_isn to detect first pkt [re]transmission
324 * in mptcp_established_options_mp()
326 subflow
->snd_isn
= TCP_SKB_CB(skb
)->end_seq
;
327 if (subflow
->request_mptcp
) {
328 pr_debug("local_key=%llu", subflow
->local_key
);
329 opts
->suboptions
= OPTION_MPTCP_MPC_SYN
;
330 opts
->sndr_key
= subflow
->local_key
;
331 *size
= TCPOLEN_MPTCP_MPC_SYN
;
333 } else if (subflow
->request_join
) {
334 pr_debug("remote_token=%u, nonce=%u", subflow
->remote_token
,
335 subflow
->local_nonce
);
336 opts
->suboptions
= OPTION_MPTCP_MPJ_SYN
;
337 opts
->join_id
= subflow
->local_id
;
338 opts
->token
= subflow
->remote_token
;
339 opts
->nonce
= subflow
->local_nonce
;
340 opts
->backup
= subflow
->request_bkup
;
341 *size
= TCPOLEN_MPTCP_MPJ_SYN
;
347 void mptcp_rcv_synsent(struct sock
*sk
)
349 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
350 struct tcp_sock
*tp
= tcp_sk(sk
);
352 if (subflow
->request_mptcp
&& tp
->rx_opt
.mptcp
.mp_capable
) {
353 subflow
->mp_capable
= 1;
354 subflow
->can_ack
= 1;
355 subflow
->remote_key
= tp
->rx_opt
.mptcp
.sndr_key
;
356 pr_debug("subflow=%p, remote_key=%llu", subflow
,
357 subflow
->remote_key
);
358 } else if (subflow
->request_join
&& tp
->rx_opt
.mptcp
.mp_join
) {
359 subflow
->mp_join
= 1;
360 subflow
->thmac
= tp
->rx_opt
.mptcp
.thmac
;
361 subflow
->remote_nonce
= tp
->rx_opt
.mptcp
.nonce
;
362 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow
,
363 subflow
->thmac
, subflow
->remote_nonce
);
364 } else if (subflow
->request_mptcp
) {
365 tcp_sk(sk
)->is_mptcp
= 0;
/* MP_JOIN client subflow must wait for 4th ack before sending any data:
 * TCP can't schedule delack timer before the subflow is fully established.
 * MPTCP uses the delack timer to do 3rd ack retransmissions
 */
373 static void schedule_3rdack_retransmission(struct sock
*sk
)
375 struct inet_connection_sock
*icsk
= inet_csk(sk
);
376 struct tcp_sock
*tp
= tcp_sk(sk
);
377 unsigned long timeout
;
379 /* reschedule with a timeout above RTT, as we must look only for drop */
381 timeout
= tp
->srtt_us
<< 1;
383 timeout
= TCP_TIMEOUT_INIT
;
385 WARN_ON_ONCE(icsk
->icsk_ack
.pending
& ICSK_ACK_TIMER
);
386 icsk
->icsk_ack
.pending
|= ICSK_ACK_SCHED
| ICSK_ACK_TIMER
;
387 icsk
->icsk_ack
.timeout
= timeout
;
388 sk_reset_timer(sk
, &icsk
->icsk_delack_timer
, timeout
);
391 static void clear_3rdack_retransmission(struct sock
*sk
)
393 struct inet_connection_sock
*icsk
= inet_csk(sk
);
395 sk_stop_timer(sk
, &icsk
->icsk_delack_timer
);
396 icsk
->icsk_ack
.timeout
= 0;
397 icsk
->icsk_ack
.ato
= 0;
398 icsk
->icsk_ack
.pending
&= ~(ICSK_ACK_SCHED
| ICSK_ACK_TIMER
);
401 static bool mptcp_established_options_mp(struct sock
*sk
, struct sk_buff
*skb
,
403 unsigned int remaining
,
404 struct mptcp_out_options
*opts
)
406 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
407 struct mptcp_ext
*mpext
;
408 unsigned int data_len
;
410 /* When skb is not available, we better over-estimate the emitted
411 * options len. A full DSS option (28 bytes) is longer than
412 * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
413 * tell the caller to defer the estimate to
414 * mptcp_established_options_dss(), which will reserve enough space.
419 /* MPC/MPJ needed only on 3rd ack packet */
420 if (subflow
->fully_established
||
421 subflow
->snd_isn
!= TCP_SKB_CB(skb
)->seq
)
424 if (subflow
->mp_capable
) {
425 mpext
= mptcp_get_ext(skb
);
426 data_len
= mpext
? mpext
->data_len
: 0;
428 /* we will check ext_copy.data_len in mptcp_write_options() to
429 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
430 * TCPOLEN_MPTCP_MPC_ACK
432 opts
->ext_copy
.data_len
= data_len
;
433 opts
->suboptions
= OPTION_MPTCP_MPC_ACK
;
434 opts
->sndr_key
= subflow
->local_key
;
435 opts
->rcvr_key
= subflow
->remote_key
;
438 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
439 * packets that start the first subflow of an MPTCP connection,
440 * as well as the first packet that carries data
443 *size
= ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA
, 4);
445 *size
= TCPOLEN_MPTCP_MPC_ACK
;
447 pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
448 subflow
, subflow
->local_key
, subflow
->remote_key
,
452 } else if (subflow
->mp_join
) {
453 opts
->suboptions
= OPTION_MPTCP_MPJ_ACK
;
454 memcpy(opts
->hmac
, subflow
->hmac
, MPTCPOPT_HMAC_LEN
);
455 *size
= TCPOLEN_MPTCP_MPJ_ACK
;
456 pr_debug("subflow=%p", subflow
);
458 schedule_3rdack_retransmission(sk
);
464 static void mptcp_write_data_fin(struct mptcp_subflow_context
*subflow
,
465 struct mptcp_ext
*ext
)
468 /* RFC6824 requires a DSS mapping with specific values
469 * if DATA_FIN is set but no data payload is mapped
474 ext
->data_seq
= subflow
->data_fin_tx_seq
;
475 ext
->subflow_seq
= 0;
477 } else if (ext
->data_seq
+ ext
->data_len
== subflow
->data_fin_tx_seq
) {
478 /* If there's an existing DSS mapping and it is the
479 * final mapping, DATA_FIN consumes 1 additional byte of
487 static bool mptcp_established_options_dss(struct sock
*sk
, struct sk_buff
*skb
,
489 unsigned int remaining
,
490 struct mptcp_out_options
*opts
)
492 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
493 unsigned int dss_size
= 0;
494 struct mptcp_ext
*mpext
;
495 struct mptcp_sock
*msk
;
496 unsigned int ack_size
;
501 mpext
= mptcp_get_ext(skb
);
502 tcp_fin
= TCP_SKB_CB(skb
)->tcp_flags
& TCPHDR_FIN
;
508 if (!skb
|| (mpext
&& mpext
->use_map
) || tcp_fin
) {
509 unsigned int map_size
;
511 map_size
= TCPOLEN_MPTCP_DSS_BASE
+ TCPOLEN_MPTCP_DSS_MAP64
;
513 remaining
-= map_size
;
516 opts
->ext_copy
= *mpext
;
518 if (skb
&& tcp_fin
&& subflow
->data_fin_tx_enable
)
519 mptcp_write_data_fin(subflow
, &opts
->ext_copy
);
523 /* passive sockets msk will set the 'can_ack' after accept(), even
524 * if the first subflow may have the already the remote key handy
526 opts
->ext_copy
.use_ack
= 0;
527 msk
= mptcp_sk(subflow
->conn
);
528 if (!READ_ONCE(msk
->can_ack
)) {
529 *size
= ALIGN(dss_size
, 4);
533 ack_size
= TCPOLEN_MPTCP_DSS_ACK64
;
535 /* Add kind/length/subtype/flag overhead if mapping is not populated */
537 ack_size
+= TCPOLEN_MPTCP_DSS_BASE
;
539 dss_size
+= ack_size
;
541 opts
->ext_copy
.data_ack
= msk
->ack_seq
;
542 opts
->ext_copy
.ack64
= 1;
543 opts
->ext_copy
.use_ack
= 1;
545 *size
= ALIGN(dss_size
, 4);
549 static u64
add_addr_generate_hmac(u64 key1
, u64 key2
, u8 addr_id
,
550 struct in_addr
*addr
)
552 u8 hmac
[MPTCP_ADDR_HMAC_LEN
];
556 memcpy(&msg
[1], &addr
->s_addr
, 4);
560 mptcp_crypto_hmac_sha(key1
, key2
, msg
, 7, hmac
);
562 return get_unaligned_be64(hmac
);
565 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
566 static u64
add_addr6_generate_hmac(u64 key1
, u64 key2
, u8 addr_id
,
567 struct in6_addr
*addr
)
569 u8 hmac
[MPTCP_ADDR_HMAC_LEN
];
573 memcpy(&msg
[1], &addr
->s6_addr
, 16);
577 mptcp_crypto_hmac_sha(key1
, key2
, msg
, 19, hmac
);
579 return get_unaligned_be64(hmac
);
583 static bool mptcp_established_options_addr(struct sock
*sk
,
585 unsigned int remaining
,
586 struct mptcp_out_options
*opts
)
588 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
589 struct mptcp_sock
*msk
= mptcp_sk(subflow
->conn
);
590 struct mptcp_addr_info saddr
;
593 if (!mptcp_pm_should_signal(msk
) ||
594 !(mptcp_pm_addr_signal(msk
, remaining
, &saddr
)))
597 len
= mptcp_add_addr_len(saddr
.family
);
602 opts
->addr_id
= saddr
.id
;
603 if (saddr
.family
== AF_INET
) {
604 opts
->suboptions
|= OPTION_MPTCP_ADD_ADDR
;
605 opts
->addr
= saddr
.addr
;
606 opts
->ahmac
= add_addr_generate_hmac(msk
->local_key
,
611 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
612 else if (saddr
.family
== AF_INET6
) {
613 opts
->suboptions
|= OPTION_MPTCP_ADD_ADDR6
;
614 opts
->addr6
= saddr
.addr6
;
615 opts
->ahmac
= add_addr6_generate_hmac(msk
->local_key
,
621 pr_debug("addr_id=%d, ahmac=%llu", opts
->addr_id
, opts
->ahmac
);
626 bool mptcp_established_options(struct sock
*sk
, struct sk_buff
*skb
,
627 unsigned int *size
, unsigned int remaining
,
628 struct mptcp_out_options
*opts
)
630 unsigned int opt_size
= 0;
633 opts
->suboptions
= 0;
635 if (mptcp_established_options_mp(sk
, skb
, &opt_size
, remaining
, opts
))
637 else if (mptcp_established_options_dss(sk
, skb
, &opt_size
, remaining
,
641 /* we reserved enough space for the above options, and exceeding the
642 * TCP option space would be fatal
644 if (WARN_ON_ONCE(opt_size
> remaining
))
648 remaining
-= opt_size
;
649 if (mptcp_established_options_addr(sk
, &opt_size
, remaining
, opts
)) {
651 remaining
-= opt_size
;
658 bool mptcp_synack_options(const struct request_sock
*req
, unsigned int *size
,
659 struct mptcp_out_options
*opts
)
661 struct mptcp_subflow_request_sock
*subflow_req
= mptcp_subflow_rsk(req
);
663 if (subflow_req
->mp_capable
) {
664 opts
->suboptions
= OPTION_MPTCP_MPC_SYNACK
;
665 opts
->sndr_key
= subflow_req
->local_key
;
666 *size
= TCPOLEN_MPTCP_MPC_SYNACK
;
667 pr_debug("subflow_req=%p, local_key=%llu",
668 subflow_req
, subflow_req
->local_key
);
670 } else if (subflow_req
->mp_join
) {
671 opts
->suboptions
= OPTION_MPTCP_MPJ_SYNACK
;
672 opts
->backup
= subflow_req
->backup
;
673 opts
->join_id
= subflow_req
->local_id
;
674 opts
->thmac
= subflow_req
->thmac
;
675 opts
->nonce
= subflow_req
->local_nonce
;
676 pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
677 subflow_req
, opts
->backup
, opts
->join_id
,
678 opts
->thmac
, opts
->nonce
);
679 *size
= TCPOLEN_MPTCP_MPJ_SYNACK
;
685 static bool check_fully_established(struct mptcp_sock
*msk
, struct sock
*sk
,
686 struct mptcp_subflow_context
*subflow
,
688 struct mptcp_options_received
*mp_opt
)
690 /* here we can process OoO, in-window pkts, only in-sequence 4th ack
691 * will make the subflow fully established
693 if (likely(subflow
->fully_established
)) {
694 /* on passive sockets, check for 3rd ack retransmission
695 * note that msk is always set by subflow_syn_recv_sock()
696 * for mp_join subflows
698 if (TCP_SKB_CB(skb
)->seq
== subflow
->ssn_offset
+ 1 &&
699 TCP_SKB_CB(skb
)->end_seq
== TCP_SKB_CB(skb
)->seq
&&
700 subflow
->mp_join
&& mp_opt
->mp_join
&&
701 READ_ONCE(msk
->pm
.server_side
))
703 goto fully_established
;
706 /* we should process OoO packets before the first subflow is fully
707 * established, but not expected for MP_JOIN subflows
709 if (TCP_SKB_CB(skb
)->seq
!= subflow
->ssn_offset
+ 1)
710 return subflow
->mp_capable
;
712 if (mp_opt
->use_ack
) {
713 /* subflows are fully established as soon as we get any
716 subflow
->fully_established
= 1;
717 goto fully_established
;
720 WARN_ON_ONCE(subflow
->can_ack
);
722 /* If the first established packet does not contain MP_CAPABLE + data
723 * then fallback to TCP
725 if (!mp_opt
->mp_capable
) {
726 subflow
->mp_capable
= 0;
727 tcp_sk(sk
)->is_mptcp
= 0;
731 subflow
->fully_established
= 1;
732 subflow
->remote_key
= mp_opt
->sndr_key
;
733 subflow
->can_ack
= 1;
736 if (likely(subflow
->pm_notified
))
739 subflow
->pm_notified
= 1;
740 if (subflow
->mp_join
) {
741 clear_3rdack_retransmission(sk
);
742 mptcp_pm_subflow_established(msk
, subflow
);
744 mptcp_pm_fully_established(msk
);
749 static u64
expand_ack(u64 old_ack
, u64 cur_ack
, bool use_64bit
)
751 u32 old_ack32
, cur_ack32
;
756 old_ack32
= (u32
)old_ack
;
757 cur_ack32
= (u32
)cur_ack
;
758 cur_ack
= (old_ack
& GENMASK_ULL(63, 32)) + cur_ack32
;
759 if (unlikely(before(cur_ack32
, old_ack32
)))
760 return cur_ack
+ (1LL << 32);
764 static void update_una(struct mptcp_sock
*msk
,
765 struct mptcp_options_received
*mp_opt
)
767 u64 new_snd_una
, snd_una
, old_snd_una
= atomic64_read(&msk
->snd_una
);
768 u64 write_seq
= READ_ONCE(msk
->write_seq
);
770 /* avoid ack expansion on update conflict, to reduce the risk of
771 * wrongly expanding to a future ack sequence number, which is way
772 * more dangerous than missing an ack
774 new_snd_una
= expand_ack(old_snd_una
, mp_opt
->data_ack
, mp_opt
->ack64
);
776 /* ACK for data not even sent yet? Ignore. */
777 if (after64(new_snd_una
, write_seq
))
778 new_snd_una
= old_snd_una
;
780 while (after64(new_snd_una
, old_snd_una
)) {
781 snd_una
= old_snd_una
;
782 old_snd_una
= atomic64_cmpxchg(&msk
->snd_una
, snd_una
,
784 if (old_snd_una
== snd_una
) {
785 mptcp_data_acked((struct sock
*)msk
);
791 static bool add_addr_hmac_valid(struct mptcp_sock
*msk
,
792 struct mptcp_options_received
*mp_opt
)
799 if (mp_opt
->family
== MPTCP_ADDR_IPVERSION_4
)
800 hmac
= add_addr_generate_hmac(msk
->remote_key
,
802 mp_opt
->addr_id
, &mp_opt
->addr
);
803 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
805 hmac
= add_addr6_generate_hmac(msk
->remote_key
,
807 mp_opt
->addr_id
, &mp_opt
->addr6
);
810 pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
811 msk
, (unsigned long long)hmac
,
812 (unsigned long long)mp_opt
->ahmac
);
814 return hmac
== mp_opt
->ahmac
;
817 void mptcp_incoming_options(struct sock
*sk
, struct sk_buff
*skb
,
818 struct tcp_options_received
*opt_rx
)
820 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
821 struct mptcp_sock
*msk
= mptcp_sk(subflow
->conn
);
822 struct mptcp_options_received
*mp_opt
;
823 struct mptcp_ext
*mpext
;
825 mp_opt
= &opt_rx
->mptcp
;
826 if (!check_fully_established(msk
, sk
, subflow
, skb
, mp_opt
))
829 if (mp_opt
->add_addr
&& add_addr_hmac_valid(msk
, mp_opt
)) {
830 struct mptcp_addr_info addr
;
832 addr
.port
= htons(mp_opt
->port
);
833 addr
.id
= mp_opt
->addr_id
;
834 if (mp_opt
->family
== MPTCP_ADDR_IPVERSION_4
) {
835 addr
.family
= AF_INET
;
836 addr
.addr
= mp_opt
->addr
;
838 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
839 else if (mp_opt
->family
== MPTCP_ADDR_IPVERSION_6
) {
840 addr
.family
= AF_INET6
;
841 addr
.addr6
= mp_opt
->addr6
;
845 mptcp_pm_add_addr_received(msk
, &addr
);
846 mp_opt
->add_addr
= 0;
852 /* we can't wait for recvmsg() to update the ack_seq, otherwise
853 * monodirectional flows will stuck
856 update_una(msk
, mp_opt
);
858 mpext
= skb_ext_add(skb
, SKB_EXT_MPTCP
);
862 memset(mpext
, 0, sizeof(*mpext
));
864 if (mp_opt
->use_map
) {
865 if (mp_opt
->mpc_map
) {
866 /* this is an MP_CAPABLE carrying MPTCP data
867 * we know this map the first chunk of data
869 mptcp_crypto_key_sha(subflow
->remote_key
, NULL
,
872 mpext
->subflow_seq
= 1;
876 mpext
->data_seq
= mp_opt
->data_seq
;
877 mpext
->subflow_seq
= mp_opt
->subflow_seq
;
878 mpext
->dsn64
= mp_opt
->dsn64
;
879 mpext
->data_fin
= mp_opt
->data_fin
;
881 mpext
->data_len
= mp_opt
->data_len
;
886 void mptcp_write_options(__be32
*ptr
, struct mptcp_out_options
*opts
)
888 if ((OPTION_MPTCP_MPC_SYN
| OPTION_MPTCP_MPC_SYNACK
|
889 OPTION_MPTCP_MPC_ACK
) & opts
->suboptions
) {
892 if (OPTION_MPTCP_MPC_SYN
& opts
->suboptions
)
893 len
= TCPOLEN_MPTCP_MPC_SYN
;
894 else if (OPTION_MPTCP_MPC_SYNACK
& opts
->suboptions
)
895 len
= TCPOLEN_MPTCP_MPC_SYNACK
;
896 else if (opts
->ext_copy
.data_len
)
897 len
= TCPOLEN_MPTCP_MPC_ACK_DATA
;
899 len
= TCPOLEN_MPTCP_MPC_ACK
;
901 *ptr
++ = mptcp_option(MPTCPOPT_MP_CAPABLE
, len
,
902 MPTCP_SUPPORTED_VERSION
,
903 MPTCP_CAP_HMAC_SHA256
);
905 if (!((OPTION_MPTCP_MPC_SYNACK
| OPTION_MPTCP_MPC_ACK
) &
907 goto mp_capable_done
;
909 put_unaligned_be64(opts
->sndr_key
, ptr
);
911 if (!((OPTION_MPTCP_MPC_ACK
) & opts
->suboptions
))
912 goto mp_capable_done
;
914 put_unaligned_be64(opts
->rcvr_key
, ptr
);
916 if (!opts
->ext_copy
.data_len
)
917 goto mp_capable_done
;
919 put_unaligned_be32(opts
->ext_copy
.data_len
<< 16 |
920 TCPOPT_NOP
<< 8 | TCPOPT_NOP
, ptr
);
925 if (OPTION_MPTCP_ADD_ADDR
& opts
->suboptions
) {
927 *ptr
++ = mptcp_option(MPTCPOPT_ADD_ADDR
,
928 TCPOLEN_MPTCP_ADD_ADDR
, 0,
931 *ptr
++ = mptcp_option(MPTCPOPT_ADD_ADDR
,
932 TCPOLEN_MPTCP_ADD_ADDR_BASE
,
935 memcpy((u8
*)ptr
, (u8
*)&opts
->addr
.s_addr
, 4);
938 put_unaligned_be64(opts
->ahmac
, ptr
);
943 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
944 if (OPTION_MPTCP_ADD_ADDR6
& opts
->suboptions
) {
946 *ptr
++ = mptcp_option(MPTCPOPT_ADD_ADDR
,
947 TCPOLEN_MPTCP_ADD_ADDR6
, 0,
950 *ptr
++ = mptcp_option(MPTCPOPT_ADD_ADDR
,
951 TCPOLEN_MPTCP_ADD_ADDR6_BASE
,
954 memcpy((u8
*)ptr
, opts
->addr6
.s6_addr
, 16);
957 put_unaligned_be64(opts
->ahmac
, ptr
);
963 if (OPTION_MPTCP_RM_ADDR
& opts
->suboptions
) {
964 *ptr
++ = mptcp_option(MPTCPOPT_RM_ADDR
,
965 TCPOLEN_MPTCP_RM_ADDR_BASE
,
969 if (OPTION_MPTCP_MPJ_SYN
& opts
->suboptions
) {
970 *ptr
++ = mptcp_option(MPTCPOPT_MP_JOIN
,
971 TCPOLEN_MPTCP_MPJ_SYN
,
972 opts
->backup
, opts
->join_id
);
973 put_unaligned_be32(opts
->token
, ptr
);
975 put_unaligned_be32(opts
->nonce
, ptr
);
979 if (OPTION_MPTCP_MPJ_SYNACK
& opts
->suboptions
) {
980 *ptr
++ = mptcp_option(MPTCPOPT_MP_JOIN
,
981 TCPOLEN_MPTCP_MPJ_SYNACK
,
982 opts
->backup
, opts
->join_id
);
983 put_unaligned_be64(opts
->thmac
, ptr
);
985 put_unaligned_be32(opts
->nonce
, ptr
);
989 if (OPTION_MPTCP_MPJ_ACK
& opts
->suboptions
) {
990 *ptr
++ = mptcp_option(MPTCPOPT_MP_JOIN
,
991 TCPOLEN_MPTCP_MPJ_ACK
, 0, 0);
992 memcpy(ptr
, opts
->hmac
, MPTCPOPT_HMAC_LEN
);
996 if (opts
->ext_copy
.use_ack
|| opts
->ext_copy
.use_map
) {
997 struct mptcp_ext
*mpext
= &opts
->ext_copy
;
998 u8 len
= TCPOLEN_MPTCP_DSS_BASE
;
1001 if (mpext
->use_ack
) {
1002 len
+= TCPOLEN_MPTCP_DSS_ACK64
;
1003 flags
= MPTCP_DSS_HAS_ACK
| MPTCP_DSS_ACK64
;
1006 if (mpext
->use_map
) {
1007 len
+= TCPOLEN_MPTCP_DSS_MAP64
;
1009 /* Use only 64-bit mapping flags for now, add
1010 * support for optional 32-bit mappings later.
1012 flags
|= MPTCP_DSS_HAS_MAP
| MPTCP_DSS_DSN64
;
1013 if (mpext
->data_fin
)
1014 flags
|= MPTCP_DSS_DATA_FIN
;
1017 *ptr
++ = mptcp_option(MPTCPOPT_DSS
, len
, 0, flags
);
1019 if (mpext
->use_ack
) {
1020 put_unaligned_be64(mpext
->data_ack
, ptr
);
1024 if (mpext
->use_map
) {
1025 put_unaligned_be64(mpext
->data_seq
, ptr
);
1027 put_unaligned_be32(mpext
->subflow_seq
, ptr
);
1029 put_unaligned_be32(mpext
->data_len
<< 16 |
1030 TCPOPT_NOP
<< 8 | TCPOPT_NOP
, ptr
);