6.6-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 8 Apr 2024 11:40:57 +0000 (13:40 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 8 Apr 2024 11:40:57 +0000 (13:40 +0200)
added patches:
bpf-put-uprobe-link-s-path-and-task-in-release-callback.patch
bpf-support-deferring-bpf_link-dealloc-to-after-rcu-grace-period.patch
mptcp-don-t-account-accept-of-non-mpc-client-as-fallback-to-tcp.patch
mptcp-don-t-overwrite-sock_ops-in-mptcp_is_tcpsk.patch

queue-6.6/bpf-put-uprobe-link-s-path-and-task-in-release-callback.patch [new file with mode: 0644]
queue-6.6/bpf-support-deferring-bpf_link-dealloc-to-after-rcu-grace-period.patch [new file with mode: 0644]
queue-6.6/mptcp-don-t-account-accept-of-non-mpc-client-as-fallback-to-tcp.patch [new file with mode: 0644]
queue-6.6/mptcp-don-t-overwrite-sock_ops-in-mptcp_is_tcpsk.patch [new file with mode: 0644]
queue-6.6/series

diff --git a/queue-6.6/bpf-put-uprobe-link-s-path-and-task-in-release-callback.patch b/queue-6.6/bpf-put-uprobe-link-s-path-and-task-in-release-callback.patch
new file mode 100644
index 0000000..d60d2b3
--- /dev/null
+++ b/queue-6.6/bpf-put-uprobe-link-s-path-and-task-in-release-callback.patch
@@ -0,0 +1,51 @@
+From e9c856cabefb71d47b2eeb197f72c9c88e9b45b0 Mon Sep 17 00:00:00 2001
+From: Andrii Nakryiko <andrii@kernel.org>
+Date: Wed, 27 Mar 2024 22:24:25 -0700
+Subject: bpf: put uprobe link's path and task in release callback
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+commit e9c856cabefb71d47b2eeb197f72c9c88e9b45b0 upstream.
+
+There is no need to delay putting either path or task to deallocation
+step. It can be done right after bpf_uprobe_unregister. Between release
+and dealloc, there could be still some running BPF programs, but they
+don't access either task or path, only data in link->uprobes, so it is
+safe to do.
+
+On the other hand, doing path_put() in dealloc callback makes this
+dealloc sleepable because path_put() itself might sleep. Which is
+problematic due to the need to call uprobe's dealloc through call_rcu(),
+which is what is done in the next bug fix patch. So solve the problem by
+releasing these resources early.
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20240328052426.3042617-1-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/bpf_trace.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/kernel/trace/bpf_trace.c
++++ b/kernel/trace/bpf_trace.c
+@@ -3065,6 +3065,9 @@ static void bpf_uprobe_multi_link_releas
+       umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+       bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
++      if (umulti_link->task)
++              put_task_struct(umulti_link->task);
++      path_put(&umulti_link->path);
+ }
+ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
+@@ -3072,9 +3075,6 @@ static void bpf_uprobe_multi_link_deallo
+       struct bpf_uprobe_multi_link *umulti_link;
+       umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+-      if (umulti_link->task)
+-              put_task_struct(umulti_link->task);
+-      path_put(&umulti_link->path);
+       kvfree(umulti_link->uprobes);
+       kfree(umulti_link);
+ }
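
The net effect of the patch above: ->release() now puts the path and task references as soon as the last link reference is dropped, and only the memory that still-running BPF programs may dereference survives until ->dealloc(). A minimal sketch of the resulting two-stage teardown contract follows; the example_* names and struct layout are hypothetical, illustrating the split rather than reproducing the kernel's bpf_uprobe_multi_link:

	/* Hypothetical link type; not from the patch. Headers such as
	 * <linux/bpf.h>, <linux/path.h> and <linux/sched/task.h> assumed.
	 */
	struct example_link {
		struct bpf_link link;
		struct path path;         /* put in release: programs don't use it */
		struct task_struct *task; /* put in release: programs don't use it */
		void *data;               /* freed in dealloc: programs may read it */
	};

	static void example_link_release(struct bpf_link *link)
	{
		struct example_link *el = container_of(link, struct example_link, link);

		/* safe to drop immediately: in-flight programs only touch el->data */
		if (el->task)
			put_task_struct(el->task);
		path_put(&el->path);
	}

	static void example_link_dealloc(struct bpf_link *link)
	{
		struct example_link *el = container_of(link, struct example_link, link);

		/* may be invoked via call_rcu() once the next patch lands, so it
		 * must not sleep -- hence no path_put() here anymore
		 */
		kvfree(el->data);
		kfree(el);
	}
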
diff --git a/queue-6.6/bpf-support-deferring-bpf_link-dealloc-to-after-rcu-grace-period.patch b/queue-6.6/bpf-support-deferring-bpf_link-dealloc-to-after-rcu-grace-period.patch
new file mode 100644
index 0000000..e3017aa
--- /dev/null
+++ b/queue-6.6/bpf-support-deferring-bpf_link-dealloc-to-after-rcu-grace-period.patch
@@ -0,0 +1,162 @@
+From 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Mon Sep 17 00:00:00 2001
+From: Andrii Nakryiko <andrii@kernel.org>
+Date: Wed, 27 Mar 2024 22:24:26 -0700
+Subject: bpf: support deferring bpf_link dealloc to after RCU grace period
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+commit 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce upstream.
+
+BPF link for some program types is passed as a "context" which can be
+used by those BPF programs to look up additional information. E.g., for
+multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.
+
+Because of this runtime dependency, when bpf_link refcnt drops to zero
+there could still be active BPF programs running accessing link data.
+
+This patch adds generic support to defer bpf_link dealloc callback to
+after RCU GP, if requested. This is done by exposing two different
+deallocation callbacks, one synchronous and one deferred. If deferred
+one is provided, bpf_link_free() will schedule dealloc_deferred()
+callback to happen after RCU GP.
+
+BPF is using two flavors of RCU: "classic" non-sleepable one and RCU
+tasks trace one. The latter is used when sleepable BPF programs are
+used. bpf_link_free() accommodates that by checking underlying BPF
+program's sleepable flag, and goes either through normal RCU GP only for
+non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP
+(taking into account rcu_trace_implies_rcu_gp() optimization), if BPF
+program is sleepable.
+
+We use this for multi-kprobe and multi-uprobe links, which dereference
+link during program run. We also preventively switch raw_tp link to use
+deferred dealloc callback, as upcoming changes in bpf-next tree expose
+raw_tp link data (specifically, cookie value) to BPF program at runtime
+as well.
+
+Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
+Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
+Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com
+Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com
+Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Acked-by: Jiri Olsa <jolsa@kernel.org>
+Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf.h      |   16 +++++++++++++++-
+ kernel/bpf/syscall.c     |   35 ++++++++++++++++++++++++++++++++---
+ kernel/trace/bpf_trace.c |    4 ++--
+ 3 files changed, 49 insertions(+), 6 deletions(-)
+
+--- a/include/linux/bpf.h
++++ b/include/linux/bpf.h
+@@ -1524,12 +1524,26 @@ struct bpf_link {
+       enum bpf_link_type type;
+       const struct bpf_link_ops *ops;
+       struct bpf_prog *prog;
+-      struct work_struct work;
++      /* rcu is used before freeing, work can be used to schedule that
++       * RCU-based freeing before that, so they never overlap
++       */
++      union {
++              struct rcu_head rcu;
++              struct work_struct work;
++      };
+ };
+ struct bpf_link_ops {
+       void (*release)(struct bpf_link *link);
++      /* deallocate link resources callback, called without RCU grace period
++       * waiting
++       */
+       void (*dealloc)(struct bpf_link *link);
++      /* deallocate link resources callback, called after RCU grace period;
++       * if underlying BPF program is sleepable we go through tasks trace
++       * RCU GP and then "classic" RCU GP
++       */
++      void (*dealloc_deferred)(struct bpf_link *link);
+       int (*detach)(struct bpf_link *link);
+       int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
+                          struct bpf_prog *old_prog);
+--- a/kernel/bpf/syscall.c
++++ b/kernel/bpf/syscall.c
+@@ -2866,17 +2866,46 @@ void bpf_link_inc(struct bpf_link *link)
+       atomic64_inc(&link->refcnt);
+ }
++static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
++{
++      struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
++
++      /* free bpf_link and its containing memory */
++      link->ops->dealloc_deferred(link);
++}
++
++static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
++{
++      if (rcu_trace_implies_rcu_gp())
++              bpf_link_defer_dealloc_rcu_gp(rcu);
++      else
++              call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
++}
++
+ /* bpf_link_free is guaranteed to be called from process context */
+ static void bpf_link_free(struct bpf_link *link)
+ {
++      bool sleepable = false;
++
+       bpf_link_free_id(link->id);
+       if (link->prog) {
++              sleepable = link->prog->aux->sleepable;
+               /* detach BPF program, clean up used resources */
+               link->ops->release(link);
+               bpf_prog_put(link->prog);
+       }
+-      /* free bpf_link and its containing memory */
+-      link->ops->dealloc(link);
++      if (link->ops->dealloc_deferred) {
++              /* schedule BPF link deallocation; if underlying BPF program
++               * is sleepable, we need to first wait for RCU tasks trace
++               * sync, then go through "classic" RCU grace period
++               */
++              if (sleepable)
++                      call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
++              else
++                      call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
++      }
++      if (link->ops->dealloc)
++              link->ops->dealloc(link);
+ }
+ static void bpf_link_put_deferred(struct work_struct *work)
+@@ -3381,7 +3410,7 @@ static int bpf_raw_tp_link_fill_link_inf
+ static const struct bpf_link_ops bpf_raw_tp_link_lops = {
+       .release = bpf_raw_tp_link_release,
+-      .dealloc = bpf_raw_tp_link_dealloc,
++      .dealloc_deferred = bpf_raw_tp_link_dealloc,
+       .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
+       .fill_link_info = bpf_raw_tp_link_fill_link_info,
+ };
+--- a/kernel/trace/bpf_trace.c
++++ b/kernel/trace/bpf_trace.c
+@@ -2639,7 +2639,7 @@ static int bpf_kprobe_multi_link_fill_li
+ static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
+       .release = bpf_kprobe_multi_link_release,
+-      .dealloc = bpf_kprobe_multi_link_dealloc,
++      .dealloc_deferred = bpf_kprobe_multi_link_dealloc,
+       .fill_link_info = bpf_kprobe_multi_link_fill_link_info,
+ };
+@@ -3081,7 +3081,7 @@ static void bpf_uprobe_multi_link_deallo
+ static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
+       .release = bpf_uprobe_multi_link_release,
+-      .dealloc = bpf_uprobe_multi_link_dealloc,
++      .dealloc_deferred = bpf_uprobe_multi_link_dealloc,
+ };
+ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
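
As the three ops-table hunks above show, opting a link type into the new scheme is a one-line change: point .dealloc_deferred at the existing dealloc callback and leave .dealloc unset, and bpf_link_free() routes the callback through call_rcu() (preceded by call_rcu_tasks_trace() when the underlying program is sleepable) instead of calling it synchronously. A condensed sketch, reusing the hypothetical example_* names from the earlier note:

	/* Hypothetical link type using the new deferred-dealloc hook. */
	static const struct bpf_link_ops example_link_lops = {
		.release          = example_link_release, /* immediate, on last ref */
		.dealloc_deferred = example_link_dealloc, /* after RCU grace period(s) */
	};
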
diff --git a/queue-6.6/mptcp-don-t-account-accept-of-non-mpc-client-as-fallback-to-tcp.patch b/queue-6.6/mptcp-don-t-account-accept-of-non-mpc-client-as-fallback-to-tcp.patch
new file mode 100644
index 0000000..fd53f0f
--- /dev/null
+++ b/queue-6.6/mptcp-don-t-account-accept-of-non-mpc-client-as-fallback-to-tcp.patch
@@ -0,0 +1,104 @@
+From 7a1b3490f47e88ec4cbde65f1a77a0f4bc972282 Mon Sep 17 00:00:00 2001
+From: Davide Caratti <dcaratti@redhat.com>
+Date: Fri, 29 Mar 2024 13:08:52 +0100
+Subject: mptcp: don't account accept() of non-MPC client as fallback to TCP
+
+From: Davide Caratti <dcaratti@redhat.com>
+
+commit 7a1b3490f47e88ec4cbde65f1a77a0f4bc972282 upstream.
+
+Current MPTCP servers increment MPTcpExtMPCapableFallbackACK when they
+accept non-MPC connections. As reported by Christoph, this is "surprising"
+because the counter might become greater than MPTcpExtMPCapableSYNRX.
+
+MPTcpExtMPCapableFallbackACK counter's name suggests it should only be
+incremented when a connection was seen using MPTCP options, then a
+fallback to TCP has been done. Let's do that by incrementing it when
+the subflow context of an inbound MPC connection attempt is dropped.
+Also, update mptcp_connect.sh kselftest, to ensure that the
+above MIB does not increment in case a pure TCP client connects to a
+MPTCP server.
+
+Fixes: fc518953bc9c ("mptcp: add and use MIB counter infrastructure")
+Cc: stable@vger.kernel.org
+Reported-by: Christoph Paasch <cpaasch@apple.com>
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/449
+Signed-off-by: Davide Caratti <dcaratti@redhat.com>
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Link: https://lore.kernel.org/r/20240329-upstream-net-20240329-fallback-mib-v1-1-324a8981da48@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c                               |    2 --
+ net/mptcp/subflow.c                                |    2 ++
+ tools/testing/selftests/net/mptcp/mptcp_connect.sh |    9 +++++++++
+ 3 files changed, 11 insertions(+), 2 deletions(-)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -3911,8 +3911,6 @@ static int mptcp_stream_accept(struct so
+                               mptcp_set_state(newsk, TCP_CLOSE);
+               }
+       } else {
+-              MPTCP_INC_STATS(sock_net(ssk),
+-                              MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
+ tcpfallback:
+               newsk->sk_kern_sock = kern;
+               lock_sock(newsk);
+--- a/net/mptcp/subflow.c
++++ b/net/mptcp/subflow.c
+@@ -905,6 +905,8 @@ dispose_child:
+       return child;
+ fallback:
++      if (fallback)
++              SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
+       mptcp_subflow_drop_ctx(child);
+       return child;
+ }
+--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
++++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+@@ -433,12 +433,14 @@ do_transfer()
+       local stat_cookierx_last
+       local stat_csum_err_s
+       local stat_csum_err_c
++      local stat_tcpfb_last_l
+       stat_synrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX")
+       stat_ackrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX")
+       stat_cookietx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent")
+       stat_cookierx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv")
+       stat_csum_err_s=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtDataCsumErr")
+       stat_csum_err_c=$(mptcp_lib_get_counter "${connector_ns}" "MPTcpExtDataCsumErr")
++      stat_tcpfb_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK")
+       timeout ${timeout_test} \
+               ip netns exec ${listener_ns} \
+@@ -506,11 +508,13 @@ do_transfer()
+       local stat_cookietx_now
+       local stat_cookierx_now
+       local stat_ooo_now
++      local stat_tcpfb_now_l
+       stat_synrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX")
+       stat_ackrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX")
+       stat_cookietx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent")
+       stat_cookierx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv")
+       stat_ooo_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtTCPOFOQueue")
++      stat_tcpfb_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK")
+       expect_synrx=$((stat_synrx_last_l))
+       expect_ackrx=$((stat_ackrx_last_l))
+@@ -564,6 +568,11 @@ do_transfer()
+               mptcp_lib_result_fail "${TEST_GROUP}: ${result_msg}"
+       fi
++      if [ ${stat_ooo_now} -eq 0 ] && [ ${stat_tcpfb_last_l} -ne ${stat_tcpfb_now_l} ]; then
++              mptcp_lib_pr_fail "unexpected fallback to TCP"
++              rets=1
++      fi
++
+       if [ $cookies -eq 2 ];then
+               if [ $stat_cookietx_last -ge $stat_cookietx_now ] ;then
+                       printf " WARN: CookieSent: did not advance"
diff --git a/queue-6.6/mptcp-don-t-overwrite-sock_ops-in-mptcp_is_tcpsk.patch b/queue-6.6/mptcp-don-t-overwrite-sock_ops-in-mptcp_is_tcpsk.patch
new file mode 100644
index 0000000..4bc6bfe
--- /dev/null
+++ b/queue-6.6/mptcp-don-t-overwrite-sock_ops-in-mptcp_is_tcpsk.patch
@@ -0,0 +1,187 @@
+From 8e2b8a9fa512709e6fee744dcd4e2a20ee7f5c56 Mon Sep 17 00:00:00 2001
+From: Davide Caratti <dcaratti@redhat.com>
+Date: Tue, 19 Dec 2023 22:31:04 +0100
+Subject: mptcp: don't overwrite sock_ops in mptcp_is_tcpsk()
+
+From: Davide Caratti <dcaratti@redhat.com>
+
+commit 8e2b8a9fa512709e6fee744dcd4e2a20ee7f5c56 upstream.
+
+Eric Dumazet suggests:
+
+ > The fact that mptcp_is_tcpsk() was able to write over sock->ops was a
+ > bit strange to me.
+ > mptcp_is_tcpsk() should answer a question, with a read-only argument.
+
+re-factor code to avoid overwriting sock_ops inside that function. Also,
+change the helper name to reflect the semantics and to disambiguate from
+its dual, sk_is_mptcp(). While at it, collapse mptcp_stream_accept() and
+mptcp_accept() into a single function, where fallback / non-fallback are
+separated into a single sk_is_mptcp() conditional.
+
+Link: https://github.com/multipath-tcp/mptcp_net-next/issues/432
+Suggested-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Davide Caratti <dcaratti@redhat.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Matthieu Baerts <matttbe@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c |  108 ++++++++++++++++++++-------------------------------
+ 1 file changed, 44 insertions(+), 64 deletions(-)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -55,28 +55,14 @@ static u64 mptcp_wnd_end(const struct mp
+       return READ_ONCE(msk->wnd_end);
+ }
+-static bool mptcp_is_tcpsk(struct sock *sk)
++static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
+ {
+-      struct socket *sock = sk->sk_socket;
+-
+-      if (unlikely(sk->sk_prot == &tcp_prot)) {
+-              /* we are being invoked after mptcp_accept() has
+-               * accepted a non-mp-capable flow: sk is a tcp_sk,
+-               * not an mptcp one.
+-               *
+-               * Hand the socket over to tcp so all further socket ops
+-               * bypass mptcp.
+-               */
+-              WRITE_ONCE(sock->ops, &inet_stream_ops);
+-              return true;
+ #if IS_ENABLED(CONFIG_MPTCP_IPV6)
+-      } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
+-              WRITE_ONCE(sock->ops, &inet6_stream_ops);
+-              return true;
++      if (sk->sk_prot == &tcpv6_prot)
++              return &inet6_stream_ops;
+ #endif
+-      }
+-
+-      return false;
++      WARN_ON_ONCE(sk->sk_prot != &tcp_prot);
++      return &inet_stream_ops;
+ }
+ static int __mptcp_socket_create(struct mptcp_sock *msk)
+@@ -3328,44 +3314,6 @@ void mptcp_rcv_space_init(struct mptcp_s
+               msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+ }
+-static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err,
+-                               bool kern)
+-{
+-      struct sock *newsk;
+-
+-      pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
+-      newsk = inet_csk_accept(ssk, flags, err, kern);
+-      if (!newsk)
+-              return NULL;
+-
+-      pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
+-      if (sk_is_mptcp(newsk)) {
+-              struct mptcp_subflow_context *subflow;
+-              struct sock *new_mptcp_sock;
+-
+-              subflow = mptcp_subflow_ctx(newsk);
+-              new_mptcp_sock = subflow->conn;
+-
+-              /* is_mptcp should be false if subflow->conn is missing, see
+-               * subflow_syn_recv_sock()
+-               */
+-              if (WARN_ON_ONCE(!new_mptcp_sock)) {
+-                      tcp_sk(newsk)->is_mptcp = 0;
+-                      goto out;
+-              }
+-
+-              newsk = new_mptcp_sock;
+-              MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
+-      } else {
+-              MPTCP_INC_STATS(sock_net(ssk),
+-                              MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
+-      }
+-
+-out:
+-      newsk->sk_kern_sock = kern;
+-      return newsk;
+-}
+-
+ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
+ {
+       struct mptcp_subflow_context *subflow, *tmp;
+@@ -3802,7 +3750,6 @@ static struct proto mptcp_prot = {
+       .connect        = mptcp_connect,
+       .disconnect     = mptcp_disconnect,
+       .close          = mptcp_close,
+-      .accept         = mptcp_accept,
+       .setsockopt     = mptcp_setsockopt,
+       .getsockopt     = mptcp_getsockopt,
+       .shutdown       = mptcp_shutdown,
+@@ -3912,18 +3859,36 @@ static int mptcp_stream_accept(struct so
+       if (!ssk)
+               return -EINVAL;
+-      newsk = mptcp_accept(ssk, flags, &err, kern);
++      pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
++      newsk = inet_csk_accept(ssk, flags, &err, kern);
+       if (!newsk)
+               return err;
+-      lock_sock(newsk);
+-
+-      __inet_accept(sock, newsock, newsk);
+-      if (!mptcp_is_tcpsk(newsock->sk)) {
+-              struct mptcp_sock *msk = mptcp_sk(newsk);
++      pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
++      if (sk_is_mptcp(newsk)) {
+               struct mptcp_subflow_context *subflow;
++              struct sock *new_mptcp_sock;
++
++              subflow = mptcp_subflow_ctx(newsk);
++              new_mptcp_sock = subflow->conn;
++
++              /* is_mptcp should be false if subflow->conn is missing, see
++               * subflow_syn_recv_sock()
++               */
++              if (WARN_ON_ONCE(!new_mptcp_sock)) {
++                      tcp_sk(newsk)->is_mptcp = 0;
++                      goto tcpfallback;
++              }
++
++              newsk = new_mptcp_sock;
++              MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
++
++              newsk->sk_kern_sock = kern;
++              lock_sock(newsk);
++              __inet_accept(sock, newsock, newsk);
+               set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
++              msk = mptcp_sk(newsk);
+               msk->in_accept_queue = 0;
+               /* set ssk->sk_socket of accept()ed flows to mptcp socket.
+@@ -3945,6 +3910,21 @@ static int mptcp_stream_accept(struct so
+                       if (unlikely(list_is_singular(&msk->conn_list)))
+                               mptcp_set_state(newsk, TCP_CLOSE);
+               }
++      } else {
++              MPTCP_INC_STATS(sock_net(ssk),
++                              MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
++tcpfallback:
++              newsk->sk_kern_sock = kern;
++              lock_sock(newsk);
++              __inet_accept(sock, newsock, newsk);
++              /* we are being invoked after accepting a non-mp-capable
++               * flow: sk is a tcp_sk, not an mptcp one.
++               *
++               * Hand the socket over to tcp so all further socket ops
++               * bypass mptcp.
++               */
++              WRITE_ONCE(newsock->sk->sk_socket->ops,
++                         mptcp_fallback_tcp_ops(newsock->sk));
+       }
+       release_sock(newsk);
diff --git a/queue-6.6/series b/queue-6.6/series
index 378f9558120e1f5dd53b15e8275e754f9a191543..75be9db30062c62d7732ebb66647b70110948b8b 100644
--- a/queue-6.6/series
+++ b/queue-6.6/series
@@ -241,3 +241,7 @@ drm-i915-gt-enable-only-one-ccs-for-compute-workload.patch
 revert-x86-mpparse-register-apic-address-only-once.patch
 of-module-prevent-null-pointer-dereference-in-vsnprintf.patch
 selftests-mptcp-connect-fix-shellcheck-warnings.patch
+mptcp-don-t-overwrite-sock_ops-in-mptcp_is_tcpsk.patch
+mptcp-don-t-account-accept-of-non-mpc-client-as-fallback-to-tcp.patch
+bpf-put-uprobe-link-s-path-and-task-in-release-callback.patch
+bpf-support-deferring-bpf_link-dealloc-to-after-rcu-grace-period.patch