mptcp: rework poll+nospace handling
MPTCP maintains a status bit, MPTCP_SEND_SPACE, that is set when at least one subflow and the mptcp socket itself are writeable. mptcp_poll returns EPOLLOUT if the bit is set. mptcp_sendmsg makes sure MPTCP_SEND_SPACE gets cleared when last write has used up all subflows or the mptcp socket wmem. This reworks nospace handling as follows: MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE, i.e. inverted meaning. This bit is set when the mptcp socket is not writeable. The mptcp-level ack path schedule will then schedule the mptcp worker to allow it to free already-acked data (and reduce wmem usage). This will then wake userspace processes that wait for a POLLOUT event. sendmsg will set MPTCP_NOSPACE only when it has to wait for more wmem (blocking I/O case). poll path will set MPTCP_NOSPACE in case the mptcp socket is not writeable. Normal tcp-level notification (SOCK_NOSPACE) is only enabled in case the subflow socket has no available wmem. Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
parent
813e0a683d
commit
8edf08649e
|
@ -724,7 +724,7 @@ void mptcp_data_acked(struct sock *sk)
|
||||||
{
|
{
|
||||||
mptcp_reset_timer(sk);
|
mptcp_reset_timer(sk);
|
||||||
|
|
||||||
if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
|
if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
|
||||||
mptcp_send_head(sk) ||
|
mptcp_send_head(sk) ||
|
||||||
(inet_sk_state_load(sk) != TCP_ESTABLISHED)))
|
(inet_sk_state_load(sk) != TCP_ESTABLISHED)))
|
||||||
mptcp_schedule_work(sk);
|
mptcp_schedule_work(sk);
|
||||||
|
@ -835,20 +835,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
|
||||||
put_page(dfrag->page);
|
put_page(dfrag->page);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool mptcp_is_writeable(struct mptcp_sock *msk)
|
|
||||||
{
|
|
||||||
struct mptcp_subflow_context *subflow;
|
|
||||||
|
|
||||||
if (!sk_stream_is_writeable((struct sock *)msk))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
mptcp_for_each_subflow(msk, subflow) {
|
|
||||||
if (sk_stream_is_writeable(subflow->tcp_sock))
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mptcp_clean_una(struct sock *sk)
|
static void mptcp_clean_una(struct sock *sk)
|
||||||
{
|
{
|
||||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||||
|
@ -901,13 +887,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk)
|
||||||
mptcp_clean_una(sk);
|
mptcp_clean_una(sk);
|
||||||
|
|
||||||
/* Only wake up writers if a subflow is ready */
|
/* Only wake up writers if a subflow is ready */
|
||||||
if (mptcp_is_writeable(msk)) {
|
if (sk_stream_is_writeable(sk)) {
|
||||||
set_bit(MPTCP_SEND_SPACE, &msk->flags);
|
clear_bit(MPTCP_NOSPACE, &msk->flags);
|
||||||
smp_mb__after_atomic();
|
|
||||||
|
|
||||||
/* set SEND_SPACE before sk_stream_write_space clears
|
|
||||||
* NOSPACE
|
|
||||||
*/
|
|
||||||
sk_stream_write_space(sk);
|
sk_stream_write_space(sk);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1041,17 +1022,25 @@ static void mptcp_nospace(struct mptcp_sock *msk)
|
||||||
{
|
{
|
||||||
struct mptcp_subflow_context *subflow;
|
struct mptcp_subflow_context *subflow;
|
||||||
|
|
||||||
clear_bit(MPTCP_SEND_SPACE, &msk->flags);
|
set_bit(MPTCP_NOSPACE, &msk->flags);
|
||||||
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
|
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
|
||||||
|
|
||||||
mptcp_for_each_subflow(msk, subflow) {
|
mptcp_for_each_subflow(msk, subflow) {
|
||||||
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
||||||
|
bool ssk_writeable = sk_stream_is_writeable(ssk);
|
||||||
struct socket *sock = READ_ONCE(ssk->sk_socket);
|
struct socket *sock = READ_ONCE(ssk->sk_socket);
|
||||||
|
|
||||||
|
if (ssk_writeable || !sock)
|
||||||
|
continue;
|
||||||
|
|
||||||
/* enables ssk->write_space() callbacks */
|
/* enables ssk->write_space() callbacks */
|
||||||
if (sock)
|
set_bit(SOCK_NOSPACE, &sock->flags);
|
||||||
set_bit(SOCK_NOSPACE, &sock->flags);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* mptcp_data_acked() could run just before we set the NOSPACE bit,
|
||||||
|
* so explicitly check for snd_una value
|
||||||
|
*/
|
||||||
|
mptcp_clean_una((struct sock *)msk);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
|
static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
|
||||||
|
@ -1155,12 +1144,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ssk_check_wmem(struct mptcp_sock *msk)
|
|
||||||
{
|
|
||||||
if (unlikely(!mptcp_is_writeable(msk)))
|
|
||||||
mptcp_nospace(msk);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mptcp_push_release(struct sock *sk, struct sock *ssk,
|
static void mptcp_push_release(struct sock *sk, struct sock *ssk,
|
||||||
struct mptcp_sendmsg_info *info)
|
struct mptcp_sendmsg_info *info)
|
||||||
{
|
{
|
||||||
|
@ -1332,7 +1315,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
||||||
|
|
||||||
wait_for_memory:
|
wait_for_memory:
|
||||||
mptcp_nospace(msk);
|
mptcp_nospace(msk);
|
||||||
mptcp_clean_una(sk);
|
|
||||||
if (mptcp_timer_pending(sk))
|
if (mptcp_timer_pending(sk))
|
||||||
mptcp_reset_timer(sk);
|
mptcp_reset_timer(sk);
|
||||||
ret = sk_stream_wait_memory(sk, &timeo);
|
ret = sk_stream_wait_memory(sk, &timeo);
|
||||||
|
@ -1344,7 +1326,6 @@ wait_for_memory:
|
||||||
mptcp_push_pending(sk, msg->msg_flags);
|
mptcp_push_pending(sk, msg->msg_flags);
|
||||||
|
|
||||||
out:
|
out:
|
||||||
ssk_check_wmem(msk);
|
|
||||||
release_sock(sk);
|
release_sock(sk);
|
||||||
return copied ? : ret;
|
return copied ? : ret;
|
||||||
}
|
}
|
||||||
|
@ -1921,7 +1902,6 @@ static int __mptcp_init_sock(struct sock *sk)
|
||||||
INIT_LIST_HEAD(&msk->conn_list);
|
INIT_LIST_HEAD(&msk->conn_list);
|
||||||
INIT_LIST_HEAD(&msk->join_list);
|
INIT_LIST_HEAD(&msk->join_list);
|
||||||
INIT_LIST_HEAD(&msk->rtx_queue);
|
INIT_LIST_HEAD(&msk->rtx_queue);
|
||||||
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
|
|
||||||
INIT_WORK(&msk->work, mptcp_worker);
|
INIT_WORK(&msk->work, mptcp_worker);
|
||||||
msk->out_of_order_queue = RB_ROOT;
|
msk->out_of_order_queue = RB_ROOT;
|
||||||
msk->first_pending = NULL;
|
msk->first_pending = NULL;
|
||||||
|
@ -2619,13 +2599,6 @@ bool mptcp_finish_join(struct sock *ssk)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool mptcp_memory_free(const struct sock *sk, int wake)
|
|
||||||
{
|
|
||||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
|
||||||
|
|
||||||
return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct proto mptcp_prot = {
|
static struct proto mptcp_prot = {
|
||||||
.name = "MPTCP",
|
.name = "MPTCP",
|
||||||
.owner = THIS_MODULE,
|
.owner = THIS_MODULE,
|
||||||
|
@ -2646,7 +2619,6 @@ static struct proto mptcp_prot = {
|
||||||
.sockets_allocated = &mptcp_sockets_allocated,
|
.sockets_allocated = &mptcp_sockets_allocated,
|
||||||
.memory_allocated = &tcp_memory_allocated,
|
.memory_allocated = &tcp_memory_allocated,
|
||||||
.memory_pressure = &tcp_memory_pressure,
|
.memory_pressure = &tcp_memory_pressure,
|
||||||
.stream_memory_free = mptcp_memory_free,
|
|
||||||
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
|
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
|
||||||
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
|
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
|
||||||
.sysctl_mem = sysctl_tcp_mem,
|
.sysctl_mem = sysctl_tcp_mem,
|
||||||
|
@ -2820,6 +2792,39 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
|
||||||
0;
|
0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool __mptcp_check_writeable(struct mptcp_sock *msk)
|
||||||
|
{
|
||||||
|
struct sock *sk = (struct sock *)msk;
|
||||||
|
bool mptcp_writable;
|
||||||
|
|
||||||
|
mptcp_clean_una(sk);
|
||||||
|
mptcp_writable = sk_stream_is_writeable(sk);
|
||||||
|
if (!mptcp_writable)
|
||||||
|
mptcp_nospace(msk);
|
||||||
|
|
||||||
|
return mptcp_writable;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
|
||||||
|
{
|
||||||
|
struct sock *sk = (struct sock *)msk;
|
||||||
|
__poll_t ret = 0;
|
||||||
|
bool slow;
|
||||||
|
|
||||||
|
if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (sk_stream_is_writeable(sk))
|
||||||
|
return EPOLLOUT | EPOLLWRNORM;
|
||||||
|
|
||||||
|
slow = lock_sock_fast(sk);
|
||||||
|
if (__mptcp_check_writeable(msk))
|
||||||
|
ret = EPOLLOUT | EPOLLWRNORM;
|
||||||
|
|
||||||
|
unlock_sock_fast(sk, slow);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
|
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
|
||||||
struct poll_table_struct *wait)
|
struct poll_table_struct *wait)
|
||||||
{
|
{
|
||||||
|
@ -2838,8 +2843,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
|
||||||
|
|
||||||
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
|
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
|
||||||
mask |= mptcp_check_readable(msk);
|
mask |= mptcp_check_readable(msk);
|
||||||
if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
|
mask |= mptcp_check_writeable(msk);
|
||||||
mask |= EPOLLOUT | EPOLLWRNORM;
|
|
||||||
}
|
}
|
||||||
if (sk->sk_shutdown & RCV_SHUTDOWN)
|
if (sk->sk_shutdown & RCV_SHUTDOWN)
|
||||||
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
|
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
|
|
||||||
/* MPTCP socket flags */
|
/* MPTCP socket flags */
|
||||||
#define MPTCP_DATA_READY 0
|
#define MPTCP_DATA_READY 0
|
||||||
#define MPTCP_SEND_SPACE 1
|
#define MPTCP_NOSPACE 1
|
||||||
#define MPTCP_WORK_RTX 2
|
#define MPTCP_WORK_RTX 2
|
||||||
#define MPTCP_WORK_EOF 3
|
#define MPTCP_WORK_EOF 3
|
||||||
#define MPTCP_FALLBACK_DONE 4
|
#define MPTCP_FALLBACK_DONE 4
|
||||||
|
|
|
@ -997,17 +997,16 @@ static void subflow_data_ready(struct sock *sk)
|
||||||
static void subflow_write_space(struct sock *sk)
|
static void subflow_write_space(struct sock *sk)
|
||||||
{
|
{
|
||||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
|
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
|
||||||
|
struct socket *sock = READ_ONCE(sk->sk_socket);
|
||||||
struct sock *parent = subflow->conn;
|
struct sock *parent = subflow->conn;
|
||||||
|
|
||||||
if (!sk_stream_is_writeable(sk))
|
if (!sk_stream_is_writeable(sk))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (sk_stream_is_writeable(parent)) {
|
if (sock && sk_stream_is_writeable(parent))
|
||||||
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
|
clear_bit(SOCK_NOSPACE, &sock->flags);
|
||||||
smp_mb__after_atomic();
|
|
||||||
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
|
sk_stream_write_space(parent);
|
||||||
sk_stream_write_space(parent);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct inet_connection_sock_af_ops *
|
static struct inet_connection_sock_af_ops *
|
||||||
|
|
Loading…
Reference in New Issue