mptcp: allow collapsing consecutive sendpages on the same substream

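If the current sendmsg() lands on the same subflow we used last, we
can try to collapse the new data into the skb at the tail of that
subflow's write queue, provided the MPTCP-level sequence is still in
order.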

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Paolo Abeni 2020-01-21 16:56:27 -08:00 committed by David S. Miller
parent 7a6a6cbc3e
commit 57040755a3
1 changed files with 61 additions and 16 deletions

View File

@ -122,14 +122,27 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
return NULL; return NULL;
} }
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
struct msghdr *msg, long *timeo) const struct sk_buff *skb,
const struct mptcp_ext *mpext)
{ {
int mss_now = 0, size_goal = 0, ret = 0; if (!tcp_skb_can_collapse_to(skb))
return false;
/* can collapse only if MPTCP level sequence is in order */
return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
}
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct msghdr *msg, long *timeo, int *pmss_now,
int *ps_goal)
{
int mss_now, avail_size, size_goal, ret;
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_ext *mpext = NULL; struct mptcp_ext *mpext = NULL;
struct sk_buff *skb, *tail;
bool can_collapse = false;
struct page_frag *pfrag; struct page_frag *pfrag;
struct sk_buff *skb;
size_t psize; size_t psize;
/* use the mptcp page cache so that we can easily move the data /* use the mptcp page cache so that we can easily move the data
@ -145,8 +158,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
/* compute copy limit */ /* compute copy limit */
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
psize = min_t(int, pfrag->size - pfrag->offset, size_goal); *pmss_now = mss_now;
*ps_goal = size_goal;
avail_size = size_goal;
skb = tcp_write_queue_tail(ssk);
if (skb) {
mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
/* Limit the write to the size available in the
* current skb, if any, so that we create at most one new skb.
* Explicitly tell the TCP internals to avoid collapsing in later
* queue management operations, to avoid breaking the ext <->
* SSN association set here
*/
can_collapse = (size_goal - skb->len > 0) &&
mptcp_skb_can_collapse_to(msk, skb, mpext);
if (!can_collapse)
TCP_SKB_CB(skb)->eor = 1;
else
avail_size = size_goal - skb->len;
}
psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
/* Copy to page */
pr_debug("left=%zu", msg_data_left(msg)); pr_debug("left=%zu", msg_data_left(msg));
psize = copy_page_from_iter(pfrag->page, pfrag->offset, psize = copy_page_from_iter(pfrag->page, pfrag->offset,
min_t(size_t, msg_data_left(msg), psize), min_t(size_t, msg_data_left(msg), psize),
@ -155,14 +189,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
if (!psize) if (!psize)
return -EINVAL; return -EINVAL;
/* Mark the end of the previous write so the beginning of the /* tell the TCP stack to delay the push so that we can safely
* next write (with its own mptcp skb extension data) is not * access the skb after the sendpages call
* collapsed.
*/ */
skb = tcp_write_queue_tail(ssk);
if (skb)
TCP_SKB_CB(skb)->eor = 1;
ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize, ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
msg->msg_flags | MSG_SENDPAGE_NOTLAST); msg->msg_flags | MSG_SENDPAGE_NOTLAST);
if (ret <= 0) if (ret <= 0)
@ -170,6 +199,18 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
if (unlikely(ret < psize)) if (unlikely(ret < psize))
iov_iter_revert(&msg->msg_iter, psize - ret); iov_iter_revert(&msg->msg_iter, psize - ret);
/* if the tail skb extension is still the cached one, collapsing
* really happened. Note: we can't check for 'same skb' as the sk_buff
* hdr on tail can be transmitted, freed and re-allocated by the
* do_tcp_sendpages() call
*/
tail = tcp_write_queue_tail(ssk);
if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
WARN_ON_ONCE(!can_collapse);
mpext->data_len += ret;
goto out;
}
skb = tcp_write_queue_tail(ssk); skb = tcp_write_queue_tail(ssk);
mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
msk->cached_ext = NULL; msk->cached_ext = NULL;
@ -185,11 +226,11 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->data_seq, mpext->subflow_seq, mpext->data_len,
mpext->dsn64); mpext->dsn64);
out:
pfrag->offset += ret; pfrag->offset += ret;
msk->write_seq += ret; msk->write_seq += ret;
mptcp_subflow_ctx(ssk)->rel_write_seq += ret; mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
return ret; return ret;
} }
@ -212,11 +253,11 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{ {
int mss_now = 0, size_goal = 0, ret = 0;
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
struct socket *ssock; struct socket *ssock;
size_t copied = 0; size_t copied = 0;
struct sock *ssk; struct sock *ssk;
int ret = 0;
long timeo; long timeo;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
@ -243,15 +284,19 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(ssk); lock_sock(ssk);
while (msg_data_left(msg)) { while (msg_data_left(msg)) {
ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo); ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
&size_goal);
if (ret < 0) if (ret < 0)
break; break;
copied += ret; copied += ret;
} }
if (copied > 0) if (copied) {
ret = copied; ret = copied;
tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
size_goal);
}
ssk_check_wmem(msk, ssk); ssk_check_wmem(msk, ssk);
release_sock(ssk); release_sock(ssk);