	return mptcp_try_coalesce((struct sock *)msk, to, from);
}
+/* "inspired" by tcp_rcvbuf_grow(), main differences:
+ * - mptcp does not maintain a msk-level window clamp
+ * - returns true when the receive buffer is actually updated
+ */
+static bool mptcp_rcvbuf_grow(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	const struct net *net = sock_net(sk);
+	int rcvwin, rcvbuf, cap;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return false;
+
+	rcvwin = msk->rcvq_space.space << 1;
+
+	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+		rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
+
+	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
+	if (rcvbuf > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+		return true;
+	}
+	return false;
+}
+
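/* Illustrative sketch, not part of the patch: the sizing rule used by
 * mptcp_rcvbuf_grow() above with the socket plumbing stripped away.
 * scale_win() is a made-up stand-in for mptcp_space_from_win(), which
 * translates an advertised-window size into the matching sk_rcvbuf value;
 * here it only adds a rough allowance for skb overhead so the arithmetic
 * stays visible. ooo_extent models how far queued out-of-order data extends
 * past the last in-order byte (end_seq - ack_seq above). E.g. with
 * space = 256 KiB, no ooo data and tcp_rmem[2] = 6 MiB, the proposed buffer
 * is min(scale_win(512 KiB), 6 MiB).
 */
#include <stdbool.h>

static int scale_win(int win)
{
	return win + (win >> 1);	/* crude overhead allowance */
}

static bool rcvbuf_grow_sketch(int *rcvbuf, int space, int ooo_extent, int cap)
{
	/* double last round's measured demand, keep queued ooo data covered */
	int rcvwin = (space << 1) + ooo_extent;
	int newbuf = scale_win(rcvwin);

	if (newbuf > cap)		/* never exceed tcp_rmem[2] */
		newbuf = cap;
	if (newbuf <= *rcvbuf)		/* the buffer only ever grows */
		return false;
	*rcvbuf = newbuf;
	return true;
}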
/* "inspired" by tcp_data_queue_ofo(), main differences:
* - use mptcp seqs
* - don't cope with sacks
end:
	skb_condense(skb);
	skb_set_owner_r(skb, sk);
+	/* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+	if (sk->sk_socket)
+		mptcp_rcvbuf_grow(sk);
}
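/* Illustrative sketch, not part of the patch: the sk->sk_socket guard above,
 * reduced to its essence. Growing the msk receive buffer right after queueing
 * out-of-order data presumably keeps the announced window able to cover it,
 * but a socket that has not been accepted yet, or that has already been
 * orphaned, has no reader that would benefit from a bigger buffer, so
 * autotuning is skipped there. The struct and helper names below are made up
 * for the sketch; "owner" mirrors sk->sk_socket, which is set when the sock
 * is grafted to a user socket and cleared on orphaning.
 */
struct sketch_sock {
	void *owner;
};

static int sketch_can_autotune(const struct sketch_sock *sk)
{
	return sk->owner != NULL;
}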
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
	return moved;
}
-static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
-{
-	if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
-		WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
-}
-
static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
-	__mptcp_rcvbuf_update(sk, ssk);
-
	/* Wake-up the reader only for in-sequence data */
	if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
		sk->sk_data_ready(sk);
	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
		goto new_measure;
-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
-
-		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
-
-		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
-
-		do_div(grow, msk->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-
-		if (rcvbuf > sk->sk_rcvbuf) {
-			u32 window_clamp;
-
-			window_clamp = mptcp_win_from_space(sk, rcvbuf);
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+	msk->rcvq_space.space = msk->rcvq_space.copied;
+	if (mptcp_rcvbuf_grow(sk)) {
-			/* Make subflows follow along. If we do not do this, we
-			 * get drops at subflow level if skbs can't be moved to
-			 * the mptcp rx queue fast enough (announced rcv_win can
-			 * exceed ssk->sk_rcvbuf).
-			 */
-			mptcp_for_each_subflow(msk, subflow) {
-				struct sock *ssk;
-				bool slow;
+		/* Make subflows follow along. If we do not do this, we
+		 * get drops at subflow level if skbs can't be moved to
+		 * the mptcp rx queue fast enough (announced rcv_win can
+		 * exceed ssk->sk_rcvbuf).
+		 */
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *ssk;
+			bool slow;
-				ssk = mptcp_subflow_tcp_sock(subflow);
-				slow = lock_sock_fast(ssk);
-				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
-				WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
-				if (tcp_can_send_ack(ssk))
-					tcp_cleanup_rbuf(ssk, 1);
-				unlock_sock_fast(ssk, slow);
-			}
+			ssk = mptcp_subflow_tcp_sock(subflow);
+			slow = lock_sock_fast(ssk);
+			tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
+			tcp_rcvbuf_grow(ssk);
+			unlock_sock_fast(ssk, slow);
		}
	}
-	msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
	msk->rcvq_space.copied = 0;
	msk->rcvq_space.time = mstamp;
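/* Simplified sketch, not part of the patch: the per-measurement-round flow
 * above, one round being roughly one RTT; "copied" is what the reader
 * consumed during that round. grow() is a made-up stand-in for
 * mptcp_rcvbuf_grow() / tcp_rcvbuf_grow(): it only doubles the recorded
 * demand and caps it, i.e. the rule from the sketch further up minus the
 * out-of-order term, and locking is ignored entirely.
 */
#include <stdbool.h>

struct sketch_flow {
	int space;			/* demand seen in the previous round */
	int rcvbuf;
};

static bool grow(struct sketch_flow *f, int cap)
{
	int want = f->space << 1;

	if (want > cap)
		want = cap;
	if (want <= f->rcvbuf)
		return false;
	f->rcvbuf = want;
	return true;
}

static void rcv_space_adjust_sketch(struct sketch_flow *msk,
				    struct sketch_flow *ssk, int nr_ssk,
				    int copied, int cap)
{
	int i;

	if (copied <= msk->space)
		return;				/* demand did not increase */

	msk->space = copied;			/* remember the new demand */
	if (!grow(msk, cap))
		return;				/* msk buffer already big enough */

	/* make the subflows follow the aggregate demand, so the announced
	 * rcv_win does not outgrow the subflow-level receive buffers (see the
	 * comment in the hunk above)
	 */
	for (i = 0; i < nr_ssk; i++) {
		ssk[i].space = copied;
		grow(&ssk[i], cap);
	}
}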
	if (list_empty(&msk->conn_list))
		return false;
-	/* verify we can move any data from the subflow, eventually updating */
-	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
-		mptcp_for_each_subflow(msk, subflow)
-			__mptcp_rcvbuf_update(sk, subflow->tcp_sock);
-
	subflow = list_first_entry(&msk->conn_list,
				   struct mptcp_subflow_context, node);
	for (;;) {