1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright (c) 2019, Tessares SA.
8 #include <linux/sysctl.h>
11 #include <net/net_namespace.h>
12 #include <net/netns/generic.h>
17 #define MPTCP_SYSCTL_PATH "net/mptcp"
/* id handed back by register_pernet_subsys(); keys net_generic() lookups of
 * this netns' struct mptcp_pernet.
 */
19 static int mptcp_pernet_id
;
/* Highest value accepted by the "pm_type" sysctl (used as .extra2 below). */
22 static int mptcp_pm_type_max
= __MPTCP_PM_TYPE_MAX
;
/* Per-network-namespace MPTCP state; every field below is exposed as a sysctl
 * under net/mptcp (see mptcp_sysctl_table).
 * NOTE(review): the struct opening line and a few members (mptcp_enabled,
 * checksum_enabled, pm_type — all referenced by the accessors below) are not
 * visible in this extract; confirm against the complete file.
 */
/* handle returned by register_net_sysctl_sz(), needed for unregistering */
27 struct ctl_table_header
*ctl_table_hdr
;
/* ADD_ADDR retransmission timeout, in jiffies (proc_dointvec_jiffies) */
30 unsigned int add_addr_timeout
;
/* base blackhole-disable period in seconds; 0 disables the mechanism */
31 unsigned int blackhole_timeout
;
/* msk close timeout, in jiffies */
32 unsigned int close_timeout
;
/* number of MPTCP-level retransmissions before a subflow is deemed stale */
33 unsigned int stale_loss_cnt
;
/* how many times active MPTCP got blackhole-disabled (reset on success) */
34 atomic_t active_disable_times
;
/* SYN retransmissions tolerated before falling back to plain TCP */
35 u8 syn_retrans_before_tcp_fallback
;
/* jiffies stamp of the last blackhole-triggered disable */
36 unsigned long active_disable_stamp
;
/* accept join requests referencing the initial address/port (RFC 8684) */
39 u8 allow_join_initial_addr_port
;
/* currently selected packet scheduler name */
41 char scheduler
[MPTCP_SCHED_NAME_MAX
];
/* currently selected path-manager name */
42 char path_manager
[MPTCP_PM_NAME_MAX
];
45 static struct mptcp_pernet
*mptcp_get_pernet(const struct net
*net
)
47 return net_generic(net
, mptcp_pernet_id
);
50 int mptcp_is_enabled(const struct net
*net
)
52 return mptcp_get_pernet(net
)->mptcp_enabled
;
55 unsigned int mptcp_get_add_addr_timeout(const struct net
*net
)
57 return mptcp_get_pernet(net
)->add_addr_timeout
;
60 int mptcp_is_checksum_enabled(const struct net
*net
)
62 return mptcp_get_pernet(net
)->checksum_enabled
;
65 int mptcp_allow_join_id0(const struct net
*net
)
67 return mptcp_get_pernet(net
)->allow_join_initial_addr_port
;
70 unsigned int mptcp_stale_loss_cnt(const struct net
*net
)
72 return mptcp_get_pernet(net
)->stale_loss_cnt
;
75 unsigned int mptcp_close_timeout(const struct sock
*sk
)
77 if (sock_flag(sk
, SOCK_DEAD
))
78 return TCP_TIMEWAIT_LEN
;
79 return mptcp_get_pernet(sock_net(sk
))->close_timeout
;
82 int mptcp_get_pm_type(const struct net
*net
)
84 return mptcp_get_pernet(net
)->pm_type
;
87 const char *mptcp_get_path_manager(const struct net
*net
)
89 return mptcp_get_pernet(net
)->path_manager
;
92 const char *mptcp_get_scheduler(const struct net
*net
)
94 return mptcp_get_pernet(net
)->scheduler
;
97 static void mptcp_pernet_set_defaults(struct mptcp_pernet
*pernet
)
99 pernet
->mptcp_enabled
= 1;
100 pernet
->add_addr_timeout
= TCP_RTO_MAX
;
101 pernet
->blackhole_timeout
= 3600;
102 pernet
->syn_retrans_before_tcp_fallback
= 2;
103 atomic_set(&pernet
->active_disable_times
, 0);
104 pernet
->close_timeout
= TCP_TIMEWAIT_LEN
;
105 pernet
->checksum_enabled
= 0;
106 pernet
->allow_join_initial_addr_port
= 1;
107 pernet
->stale_loss_cnt
= 4;
108 pernet
->pm_type
= MPTCP_PM_TYPE_KERNEL
;
109 strscpy(pernet
->scheduler
, "default", sizeof(pernet
->scheduler
));
110 strscpy(pernet
->path_manager
, "kernel", sizeof(pernet
->path_manager
));
/* Validate @name against the registered schedulers and, if known, copy it
 * into the @scheduler sysctl buffer (MPTCP_SCHED_NAME_MAX, NUL-terminated by
 * strscpy()).
 * NOTE(review): the lookup-failure path, the locking around
 * mptcp_sched_find() and the return statement are not visible in this
 * extract — confirm against the complete file.
 */
114 static int mptcp_set_scheduler(char *scheduler
, const char *name
)
116 struct mptcp_sched_ops
*sched
;
/* look the scheduler up by name among the registered ones */
120 sched
= mptcp_sched_find(name
);
/* record the accepted name so later sysctl reads report it */
122 strscpy(scheduler
, name
, MPTCP_SCHED_NAME_MAX
);
/* proc handler for net.mptcp.scheduler: reads report the current name via a
 * stack copy; writes are validated through mptcp_set_scheduler() so only a
 * registered scheduler can be stored.
 * NOTE(review): the declaration of 'ret', the tbl '.data' initializer and the
 * final return are not visible in this extract.
 */
130 static int proc_scheduler(const struct ctl_table
*ctl
, int write
,
131 void *buffer
, size_t *lenp
, loff_t
*ppos
)
/* ctl->data points at the pernet scheduler[MPTCP_SCHED_NAME_MAX] buffer */
133 char (*scheduler
)[MPTCP_SCHED_NAME_MAX
] = ctl
->data
;
134 char val
[MPTCP_SCHED_NAME_MAX
];
/* shadow table so proc_dostring() works on the stack copy, not live data */
135 struct ctl_table tbl
= {
137 .maxlen
= MPTCP_SCHED_NAME_MAX
,
141 strscpy(val
, *scheduler
, MPTCP_SCHED_NAME_MAX
);
143 ret
= proc_dostring(&tbl
, write
, buffer
, lenp
, ppos
);
/* commit only successfully parsed writes, after validating the name */
144 if (write
&& ret
== 0)
145 ret
= mptcp_set_scheduler(*scheduler
, val
);
/* Read-only proc handler for net.mptcp.available_schedulers: builds the list
 * of registered schedulers in a temporary kmalloc'ed buffer and emits it via
 * proc_dostring().
 * NOTE(review): the kmalloc() failure check, kfree() of tbl.data and the
 * return statement are not visible in this extract.
 */
150 static int proc_available_schedulers(const struct ctl_table
*ctl
,
151 int write
, void *buffer
,
152 size_t *lenp
, loff_t
*ppos
)
154 struct ctl_table tbl
= { .maxlen
= MPTCP_SCHED_BUF_MAX
, };
/* temporary output buffer; GFP_USER as this runs on behalf of userspace */
157 tbl
.data
= kmalloc(tbl
.maxlen
, GFP_USER
);
161 mptcp_get_available_schedulers(tbl
.data
, MPTCP_SCHED_BUF_MAX
);
162 ret
= proc_dostring(&tbl
, write
, buffer
, lenp
, ppos
);
/* proc handler for net.mptcp.blackhole_timeout: a successful write also
 * clears active_disable_times so active MPTCP is re-enabled immediately.
 * NOTE(review): the ppos parameter line, the completion of the container_of()
 * expression and the return are not visible in this extract.
 */
168 static int proc_blackhole_detect_timeout(const struct ctl_table
*table
,
169 int write
, void *buffer
, size_t *lenp
,
/* recover the owning pernet from the blackhole_timeout field pointer */
172 struct mptcp_pernet
*pernet
= container_of(table
->data
,
177 ret
= proc_dointvec_minmax(table
, write
, buffer
, lenp
, ppos
);
/* changing the timeout resets the blackhole disable counter */
178 if (write
&& ret
== 0)
179 atomic_set(&pernet
->active_disable_times
, 0);
/* Validate @name against the registered path managers and, if known, copy it
 * into the @path_manager sysctl buffer (MPTCP_PM_NAME_MAX, NUL-terminated).
 * NOTE(review): the lookup-failure path, locking around mptcp_pm_find() and
 * the return statement are not visible in this extract.
 */
184 static int mptcp_set_path_manager(char *path_manager
, const char *name
)
186 struct mptcp_pm_ops
*pm_ops
;
/* look the path manager up by name among the registered ones */
190 pm_ops
= mptcp_pm_find(name
);
/* record the accepted name so later sysctl reads report it */
192 strscpy(path_manager
, name
, MPTCP_PM_NAME_MAX
);
/* proc handler for net.mptcp.path_manager: writes are validated through
 * mptcp_set_path_manager(), and the legacy pm_type field is kept in sync
 * ("kernel"/"userspace" map to their enum values, anything else to
 * __MPTCP_PM_TYPE_NR).
 * NOTE(review): the declaration of 'ret', the tbl '.data' initializer, some
 * closing braces and the return are not visible in this extract.
 */
200 static int proc_path_manager(const struct ctl_table
*ctl
, int write
,
201 void *buffer
, size_t *lenp
, loff_t
*ppos
)
/* recover the owning pernet from the path_manager field pointer */
203 struct mptcp_pernet
*pernet
= container_of(ctl
->data
,
206 char (*path_manager
)[MPTCP_PM_NAME_MAX
] = ctl
->data
;
207 char pm_name
[MPTCP_PM_NAME_MAX
];
/* shadow table so proc_dostring() works on the stack copy, not live data */
208 const struct ctl_table tbl
= {
210 .maxlen
= MPTCP_PM_NAME_MAX
,
214 strscpy(pm_name
, *path_manager
, MPTCP_PM_NAME_MAX
);
216 ret
= proc_dostring(&tbl
, write
, buffer
, lenp
, ppos
);
217 if (write
&& ret
== 0) {
218 ret
= mptcp_set_path_manager(*path_manager
, pm_name
);
/* mirror the name into the numeric pm_type sysctl */
220 u8 pm_type
= __MPTCP_PM_TYPE_NR
;
222 if (strncmp(pm_name
, "kernel", MPTCP_PM_NAME_MAX
) == 0)
223 pm_type
= MPTCP_PM_TYPE_KERNEL
;
224 else if (strncmp(pm_name
, "userspace", MPTCP_PM_NAME_MAX
) == 0)
225 pm_type
= MPTCP_PM_TYPE_USERSPACE
;
226 pernet
->pm_type
= pm_type
;
/* proc handler for net.mptcp.pm_type (legacy numeric knob): on a successful
 * write, the string path_manager sysctl is updated to match the new type.
 * NOTE(review): the declaration of 'ret'/'pm_name', the "kernel" assignment
 * branch, closing braces and the return are not visible in this extract.
 */
233 static int proc_pm_type(const struct ctl_table
*ctl
, int write
,
234 void *buffer
, size_t *lenp
, loff_t
*ppos
)
/* recover the owning pernet from the pm_type field pointer */
236 struct mptcp_pernet
*pernet
= container_of(ctl
->data
,
/* range-checked u8 parse; extra1/extra2 bound the accepted values */
241 ret
= proc_dou8vec_minmax(ctl
, write
, buffer
, lenp
, ppos
);
242 if (write
&& ret
== 0) {
243 u8 pm_type
= READ_ONCE(*(u8
*)ctl
->data
);
/* translate the numeric type back to a path-manager name */
246 if (pm_type
== MPTCP_PM_TYPE_KERNEL
)
248 else if (pm_type
== MPTCP_PM_TYPE_USERSPACE
)
249 pm_name
= "userspace";
250 mptcp_set_path_manager(pernet
->path_manager
, pm_name
);
/* Read-only proc handler for net.mptcp.available_path_managers: builds the
 * list of registered path managers in a temporary buffer and emits it via
 * proc_dostring().
 * NOTE(review): the kmalloc() failure check, kfree() of tbl.data and the
 * return statement are not visible in this extract.
 */
256 static int proc_available_path_managers(const struct ctl_table
*ctl
,
257 int write
, void *buffer
,
258 size_t *lenp
, loff_t
*ppos
)
260 struct ctl_table tbl
= { .maxlen
= MPTCP_PM_BUF_MAX
, };
/* temporary output buffer; GFP_USER as this runs on behalf of userspace */
263 tbl
.data
= kmalloc(tbl
.maxlen
, GFP_USER
);
267 mptcp_pm_get_available(tbl
.data
, MPTCP_PM_BUF_MAX
);
268 ret
= proc_dostring(&tbl
, write
, buffer
, lenp
, ppos
);
/* Template sysctl table for net/mptcp; per-netns copies get their .data
 * pointers rewired in mptcp_pernet_new_table() (index order matters there).
 * NOTE(review): the per-entry opening/closing braces and the '.data'
 * initializers are not visible in this extract.
 */
274 static struct ctl_table mptcp_sysctl_table
[] = {
276 .procname
= "enabled",
277 .maxlen
= sizeof(u8
),
279 /* users with CAP_NET_ADMIN or root (not and) can change this
280 * value, same as other sysctl or the 'net' tree.
282 .proc_handler
= proc_dou8vec_minmax
,
283 .extra1
= SYSCTL_ZERO
,
287 .procname
= "add_addr_timeout",
288 .maxlen
= sizeof(unsigned int),
290 .proc_handler
= proc_dointvec_jiffies
,
293 .procname
= "checksum_enabled",
294 .maxlen
= sizeof(u8
),
296 .proc_handler
= proc_dou8vec_minmax
,
297 .extra1
= SYSCTL_ZERO
,
301 .procname
= "allow_join_initial_addr_port",
302 .maxlen
= sizeof(u8
),
304 .proc_handler
= proc_dou8vec_minmax
,
305 .extra1
= SYSCTL_ZERO
,
309 .procname
= "stale_loss_cnt",
310 .maxlen
= sizeof(unsigned int),
312 .proc_handler
= proc_douintvec_minmax
,
315 .procname
= "pm_type",
316 .maxlen
= sizeof(u8
),
318 .proc_handler
= proc_pm_type
,
319 .extra1
= SYSCTL_ZERO
,
/* bounded above by mptcp_pm_type_max (see top of file) */
320 .extra2
= &mptcp_pm_type_max
323 .procname
= "scheduler",
324 .maxlen
= MPTCP_SCHED_NAME_MAX
,
326 .proc_handler
= proc_scheduler
,
329 .procname
= "available_schedulers",
330 .maxlen
= MPTCP_SCHED_BUF_MAX
,
332 .proc_handler
= proc_available_schedulers
,
335 .procname
= "close_timeout",
336 .maxlen
= sizeof(unsigned int),
338 .proc_handler
= proc_dointvec_jiffies
,
341 .procname
= "blackhole_timeout",
342 .maxlen
= sizeof(unsigned int),
344 .proc_handler
= proc_blackhole_detect_timeout
,
345 .extra1
= SYSCTL_ZERO
,
348 .procname
= "syn_retrans_before_tcp_fallback",
349 .maxlen
= sizeof(u8
),
351 .proc_handler
= proc_dou8vec_minmax
,
354 .procname
= "path_manager",
355 .maxlen
= MPTCP_PM_NAME_MAX
,
357 .proc_handler
= proc_path_manager
,
360 .procname
= "available_path_managers",
361 .maxlen
= MPTCP_PM_BUF_MAX
,
363 .proc_handler
= proc_available_path_managers
,
/* Register the net/mptcp sysctl tree for @net. init_net uses the template
 * table directly; every other netns gets a kmemdup() copy whose .data
 * pointers are rewired to this netns' pernet fields. Index comments below
 * must stay in sync with mptcp_sysctl_table's entry order.
 * NOTE(review): the kmemdup() failure check, the register failure path
 * (including the kfree() hinted at by the trailing !net_eq() check) and the
 * return statements are not visible in this extract.
 */
367 static int mptcp_pernet_new_table(struct net
*net
, struct mptcp_pernet
*pernet
)
369 struct ctl_table_header
*hdr
;
370 struct ctl_table
*table
;
372 table
= mptcp_sysctl_table
;
/* only extra namespaces need a private copy of the template */
373 if (!net_eq(net
, &init_net
)) {
374 table
= kmemdup(table
, sizeof(mptcp_sysctl_table
), GFP_KERNEL
);
379 table
[0].data
= &pernet
->mptcp_enabled
;
380 table
[1].data
= &pernet
->add_addr_timeout
;
381 table
[2].data
= &pernet
->checksum_enabled
;
382 table
[3].data
= &pernet
->allow_join_initial_addr_port
;
383 table
[4].data
= &pernet
->stale_loss_cnt
;
384 table
[5].data
= &pernet
->pm_type
;
385 table
[6].data
= &pernet
->scheduler
;
386 /* table[7] is for available_schedulers which is read-only info */
387 table
[8].data
= &pernet
->close_timeout
;
388 table
[9].data
= &pernet
->blackhole_timeout
;
389 table
[10].data
= &pernet
->syn_retrans_before_tcp_fallback
;
390 table
[11].data
= &pernet
->path_manager
;
391 /* table[12] is for available_path_managers which is read-only info */
393 hdr
= register_net_sysctl_sz(net
, MPTCP_SYSCTL_PATH
, table
,
394 ARRAY_SIZE(mptcp_sysctl_table
));
/* keep the header so mptcp_pernet_del_table() can unregister it */
398 pernet
->ctl_table_hdr
= hdr
;
/* error path: the duplicated table must be freed for extra namespaces */
403 if (!net_eq(net
, &init_net
))
/* Tear down the sysctl tree registered by mptcp_pernet_new_table().
 * NOTE(review): 'table' (retrieved via ctl_table_arg, i.e. the possibly
 * kmemdup'ed copy) is presumably kfree'd after unregistering, but that line
 * is not visible in this extract — confirm against the complete file.
 */
409 static void mptcp_pernet_del_table(struct mptcp_pernet
*pernet
)
/* grab the table pointer before unregistering invalidates the header */
411 const struct ctl_table
*table
= pernet
->ctl_table_hdr
->ctl_table_arg
;
413 unregister_net_sysctl_table(pernet
->ctl_table_hdr
);
/* !CONFIG_SYSCTL stubs: no table to create or destroy.
 * NOTE(review): the body of the new_table stub (presumably "return 0;") is
 * not visible in this extract.
 */
420 static int mptcp_pernet_new_table(struct net
*net
, struct mptcp_pernet
*pernet
)
425 static void mptcp_pernet_del_table(struct mptcp_pernet
*pernet
) {}
427 #endif /* CONFIG_SYSCTL */
429 /* The following code block is to deal with middle box issues with MPTCP,
430 * similar to what is done with TFO.
431 * The proposed solution is to disable active MPTCP globally when SYN+MPC are
432 * dropped, while SYN without MPC aren't. In this case, active side MPTCP is
433 * disabled globally for 1hr at first. Then if it happens again, it is disabled
434 * for 2h, then 4h, 8h, ...
435 * The timeout is reset back to 1hr when a successful active MPTCP connection is
* fully established.
439 /* Disable active MPTCP and record current jiffies and active_disable_times */
440 void mptcp_active_disable(struct sock
*sk
)
442 struct net
*net
= sock_net(sk
);
443 struct mptcp_pernet
*pernet
;
445 pernet
= mptcp_get_pernet(net
);
/* blackhole_timeout == 0 means the mechanism is administratively off;
 * NOTE(review): the early 'return;' for that case is not visible here.
 */
447 if (!READ_ONCE(pernet
->blackhole_timeout
))
450 /* Paired with READ_ONCE() in mptcp_active_should_disable() */
451 WRITE_ONCE(pernet
->active_disable_stamp
, jiffies
);
453 /* Paired with smp_rmb() in mptcp_active_should_disable().
454 * We want pernet->active_disable_stamp to be updated first.
456 smp_mb__before_atomic();
457 atomic_inc(&pernet
->active_disable_times
);
/* account the blackhole event in the MPTCP MIB counters */
459 MPTCP_INC_STATS(net
, MPTCP_MIB_BLACKHOLE
);
462 /* Calculate timeout for MPTCP active disable
463 * Return true if we are still in the active MPTCP disable period
464 * Return false if timeout already expired and we should use active MPTCP
/* NOTE(review): the declarations of 'disable_times'/'multiplier', the early
 * 'return false;' statements, the smp_rmb() call paired with the comment
 * below and some braces are not visible in this extract.
 */
466 bool mptcp_active_should_disable(struct sock
*ssk
)
468 struct net
*net
= sock_net(ssk
);
469 unsigned int blackhole_timeout
;
470 struct mptcp_pernet
*pernet
;
471 unsigned long timeout
;
475 pernet
= mptcp_get_pernet(net
);
476 blackhole_timeout
= READ_ONCE(pernet
->blackhole_timeout
);
/* 0 means blackhole detection is off: never disable active MPTCP */
478 if (!blackhole_timeout
)
481 disable_times
= atomic_read(&pernet
->active_disable_times
);
485 /* Paired with smp_mb__before_atomic() in mptcp_active_disable() */
488 /* Limit timeout to max: 2^6 * initial timeout */
/* exponential backoff: double the disable period per recorded event */
489 multiplier
= 1 << min(disable_times
- 1, 6);
491 /* Paired with the WRITE_ONCE() in mptcp_active_disable(). */
492 timeout
= READ_ONCE(pernet
->active_disable_stamp
) +
493 multiplier
* blackhole_timeout
* HZ
;
/* still inside the disable window? */
495 return time_before(jiffies
, timeout
);
498 /* Enable active MPTCP and reset active_disable_times if needed */
499 void mptcp_active_enable(struct sock
*sk
)
501 struct mptcp_pernet
*pernet
= mptcp_get_pernet(sock_net(sk
));
503 if (atomic_read(&pernet
->active_disable_times
)) {
504 struct dst_entry
*dst
= sk_dst_get(sk
);
506 if (dst
&& dst
->dev
&& (dst
->dev
->flags
& IFF_LOOPBACK
))
507 atomic_set(&pernet
->active_disable_times
, 0);
511 /* Check the number of retransmissions, and fallback to TCP if needed */
/* NOTE(review): the declaration of 'net'/'timeouts'/'to_max', the early
 * 'return' statements and closing braces are not visible in this extract.
 */
512 void mptcp_active_detect_blackhole(struct sock
*ssk
, bool expired
)
514 struct mptcp_subflow_context
*subflow
;
518 /* Only check MPTCP SYN ... */
519 if (likely(!sk_is_mptcp(ssk
) || ssk
->sk_state
!= TCP_SYN_SENT
))
522 subflow
= mptcp_subflow_ctx(ssk
);
524 /* ... + MP_CAPABLE */
525 if (!subflow
->request_mptcp
) {
526 /* Mark as blackhole iif the 1st non-MPTCP SYN is accepted */
527 subflow
->mpc_drop
= 0;
/* compare SYN retransmissions against the per-netns fallback limit */
532 timeouts
= inet_csk(ssk
)->icsk_retransmits
;
533 to_max
= mptcp_get_pernet(net
)->syn_retrans_before_tcp_fallback
;
535 if (timeouts
== to_max
|| (timeouts
< to_max
&& expired
)) {
536 MPTCP_INC_STATS(net
, MPTCP_MIB_MPCAPABLEACTIVEDROP
);
/* record the drop and fall back to plain TCP for this connection */
537 subflow
->mpc_drop
= 1;
538 mptcp_subflow_early_fallback(mptcp_sk(subflow
->conn
), subflow
);
542 static int __net_init
mptcp_net_init(struct net
*net
)
544 struct mptcp_pernet
*pernet
= mptcp_get_pernet(net
);
546 mptcp_pernet_set_defaults(pernet
);
548 return mptcp_pernet_new_table(net
, pernet
);
551 /* Note: the callback will only be called per extra netns */
552 static void __net_exit
mptcp_net_exit(struct net
*net
)
554 struct mptcp_pernet
*pernet
= mptcp_get_pernet(net
);
556 mptcp_pernet_del_table(pernet
);
/* Pernet registration: .size makes the core allocate a zeroed
 * struct mptcp_pernet per namespace, reachable via mptcp_pernet_id.
 * NOTE(review): the closing "};" is not visible in this extract.
 */
559 static struct pernet_operations mptcp_pernet_ops
= {
560 .init
= mptcp_net_init
,
561 .exit
= mptcp_net_exit
,
562 .id
= &mptcp_pernet_id
,
563 .size
= sizeof(struct mptcp_pernet
),
/* Boot-time MPTCP setup; registering the pernet subsystem is fatal on
 * failure since the protocol cannot work without its per-netns state.
 * NOTE(review): at least one call between the two visible ones (original
 * lines 569-570) is not visible in this extract.
 */
566 void __init
mptcp_init(void)
568 mptcp_join_cookie_init();
571 if (register_pernet_subsys(&mptcp_pernet_ops
) < 0)
572 panic("Failed to register MPTCP pernet subsystem.\n");
575 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
576 int __init
mptcpv6_init(void)
580 err
= mptcp_proto_v6_init();