mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	mptcp: rework poll+nospace handling
MPTCP maintains a status bit, MPTCP_SEND_SPACE, that is set when at least one subflow and the mptcp socket itself are writeable. mptcp_poll returns EPOLLOUT if the bit is set. mptcp_sendmsg makes sure MPTCP_SEND_SPACE gets cleared when last write has used up all subflows or the mptcp socket wmem. This reworks nospace handling as follows: MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE, i.e. inverted meaning. This bit is set when the mptcp socket is not writeable. The mptcp-level ack path schedule will then schedule the mptcp worker to allow it to free already-acked data (and reduce wmem usage). This will then wake userspace processes that wait for a POLLOUT event. sendmsg will set MPTCP_NOSPACE only when it has to wait for more wmem (blocking I/O case). poll path will set MPTCP_NOSPACE in case the mptcp socket is not writeable. Normal tcp-level notification (SOCK_NOSPACE) is only enabled in case the subflow socket has no available wmem. Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
		
							parent
							
								
									813e0a683d
								
							
						
					
					
						commit
						8edf08649e
					
				
					 3 changed files with 54 additions and 51 deletions
				
			
		| 
						 | 
				
			
			@ -724,7 +724,7 @@ void mptcp_data_acked(struct sock *sk)
 | 
			
		|||
{
 | 
			
		||||
	mptcp_reset_timer(sk);
 | 
			
		||||
 | 
			
		||||
	if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
 | 
			
		||||
	if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
 | 
			
		||||
	     mptcp_send_head(sk) ||
 | 
			
		||||
	     (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
 | 
			
		||||
		mptcp_schedule_work(sk);
 | 
			
		||||
| 
						 | 
				
			
			@ -835,20 +835,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
 | 
			
		|||
	put_page(dfrag->page);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static bool mptcp_is_writeable(struct mptcp_sock *msk)
 | 
			
		||||
{
 | 
			
		||||
	struct mptcp_subflow_context *subflow;
 | 
			
		||||
 | 
			
		||||
	if (!sk_stream_is_writeable((struct sock *)msk))
 | 
			
		||||
		return false;
 | 
			
		||||
 | 
			
		||||
	mptcp_for_each_subflow(msk, subflow) {
 | 
			
		||||
		if (sk_stream_is_writeable(subflow->tcp_sock))
 | 
			
		||||
			return true;
 | 
			
		||||
	}
 | 
			
		||||
	return false;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void mptcp_clean_una(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	struct mptcp_sock *msk = mptcp_sk(sk);
 | 
			
		||||
| 
						 | 
				
			
			@ -901,13 +887,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk)
 | 
			
		|||
	mptcp_clean_una(sk);
 | 
			
		||||
 | 
			
		||||
	/* Only wake up writers if a subflow is ready */
 | 
			
		||||
	if (mptcp_is_writeable(msk)) {
 | 
			
		||||
		set_bit(MPTCP_SEND_SPACE, &msk->flags);
 | 
			
		||||
		smp_mb__after_atomic();
 | 
			
		||||
 | 
			
		||||
		/* set SEND_SPACE before sk_stream_write_space clears
 | 
			
		||||
		 * NOSPACE
 | 
			
		||||
		 */
 | 
			
		||||
	if (sk_stream_is_writeable(sk)) {
 | 
			
		||||
		clear_bit(MPTCP_NOSPACE, &msk->flags);
 | 
			
		||||
		sk_stream_write_space(sk);
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -1041,17 +1022,25 @@ static void mptcp_nospace(struct mptcp_sock *msk)
 | 
			
		|||
{
 | 
			
		||||
	struct mptcp_subflow_context *subflow;
 | 
			
		||||
 | 
			
		||||
	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
 | 
			
		||||
	set_bit(MPTCP_NOSPACE, &msk->flags);
 | 
			
		||||
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
 | 
			
		||||
 | 
			
		||||
	mptcp_for_each_subflow(msk, subflow) {
 | 
			
		||||
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 | 
			
		||||
		bool ssk_writeable = sk_stream_is_writeable(ssk);
 | 
			
		||||
		struct socket *sock = READ_ONCE(ssk->sk_socket);
 | 
			
		||||
 | 
			
		||||
		if (ssk_writeable || !sock)
 | 
			
		||||
			continue;
 | 
			
		||||
 | 
			
		||||
		/* enables ssk->write_space() callbacks */
 | 
			
		||||
		if (sock)
 | 
			
		||||
			set_bit(SOCK_NOSPACE, &sock->flags);
 | 
			
		||||
		set_bit(SOCK_NOSPACE, &sock->flags);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/* mptcp_data_acked() could run just before we set the NOSPACE bit,
 | 
			
		||||
	 * so explicitly check for snd_una value
 | 
			
		||||
	 */
 | 
			
		||||
	mptcp_clean_una((struct sock *)msk);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
 | 
			
		||||
| 
						 | 
				
			
			@ -1155,12 +1144,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 | 
			
		|||
	return NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void ssk_check_wmem(struct mptcp_sock *msk)
 | 
			
		||||
{
 | 
			
		||||
	if (unlikely(!mptcp_is_writeable(msk)))
 | 
			
		||||
		mptcp_nospace(msk);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void mptcp_push_release(struct sock *sk, struct sock *ssk,
 | 
			
		||||
			       struct mptcp_sendmsg_info *info)
 | 
			
		||||
{
 | 
			
		||||
| 
						 | 
				
			
			@ -1332,7 +1315,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 | 
			
		|||
 | 
			
		||||
wait_for_memory:
 | 
			
		||||
		mptcp_nospace(msk);
 | 
			
		||||
		mptcp_clean_una(sk);
 | 
			
		||||
		if (mptcp_timer_pending(sk))
 | 
			
		||||
			mptcp_reset_timer(sk);
 | 
			
		||||
		ret = sk_stream_wait_memory(sk, &timeo);
 | 
			
		||||
| 
						 | 
				
			
			@ -1344,7 +1326,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 | 
			
		|||
		mptcp_push_pending(sk, msg->msg_flags);
 | 
			
		||||
 | 
			
		||||
out:
 | 
			
		||||
	ssk_check_wmem(msk);
 | 
			
		||||
	release_sock(sk);
 | 
			
		||||
	return copied ? : ret;
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -1921,7 +1902,6 @@ static int __mptcp_init_sock(struct sock *sk)
 | 
			
		|||
	INIT_LIST_HEAD(&msk->conn_list);
 | 
			
		||||
	INIT_LIST_HEAD(&msk->join_list);
 | 
			
		||||
	INIT_LIST_HEAD(&msk->rtx_queue);
 | 
			
		||||
	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
 | 
			
		||||
	INIT_WORK(&msk->work, mptcp_worker);
 | 
			
		||||
	msk->out_of_order_queue = RB_ROOT;
 | 
			
		||||
	msk->first_pending = NULL;
 | 
			
		||||
| 
						 | 
				
			
			@ -2619,13 +2599,6 @@ bool mptcp_finish_join(struct sock *ssk)
 | 
			
		|||
	return true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static bool mptcp_memory_free(const struct sock *sk, int wake)
 | 
			
		||||
{
 | 
			
		||||
	struct mptcp_sock *msk = mptcp_sk(sk);
 | 
			
		||||
 | 
			
		||||
	return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static struct proto mptcp_prot = {
 | 
			
		||||
	.name		= "MPTCP",
 | 
			
		||||
	.owner		= THIS_MODULE,
 | 
			
		||||
| 
						 | 
				
			
			@ -2646,7 +2619,6 @@ static struct proto mptcp_prot = {
 | 
			
		|||
	.sockets_allocated	= &mptcp_sockets_allocated,
 | 
			
		||||
	.memory_allocated	= &tcp_memory_allocated,
 | 
			
		||||
	.memory_pressure	= &tcp_memory_pressure,
 | 
			
		||||
	.stream_memory_free	= mptcp_memory_free,
 | 
			
		||||
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
 | 
			
		||||
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
 | 
			
		||||
	.sysctl_mem	= sysctl_tcp_mem,
 | 
			
		||||
| 
						 | 
				
			
			@ -2820,6 +2792,39 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
 | 
			
		|||
	       0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static bool __mptcp_check_writeable(struct mptcp_sock *msk)
 | 
			
		||||
{
 | 
			
		||||
	struct sock *sk = (struct sock *)msk;
 | 
			
		||||
	bool mptcp_writable;
 | 
			
		||||
 | 
			
		||||
	mptcp_clean_una(sk);
 | 
			
		||||
	mptcp_writable = sk_stream_is_writeable(sk);
 | 
			
		||||
	if (!mptcp_writable)
 | 
			
		||||
		mptcp_nospace(msk);
 | 
			
		||||
 | 
			
		||||
	return mptcp_writable;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 | 
			
		||||
{
 | 
			
		||||
	struct sock *sk = (struct sock *)msk;
 | 
			
		||||
	__poll_t ret = 0;
 | 
			
		||||
	bool slow;
 | 
			
		||||
 | 
			
		||||
	if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
 | 
			
		||||
		return 0;
 | 
			
		||||
 | 
			
		||||
	if (sk_stream_is_writeable(sk))
 | 
			
		||||
		return EPOLLOUT | EPOLLWRNORM;
 | 
			
		||||
 | 
			
		||||
	slow = lock_sock_fast(sk);
 | 
			
		||||
	if (__mptcp_check_writeable(msk))
 | 
			
		||||
		ret = EPOLLOUT | EPOLLWRNORM;
 | 
			
		||||
 | 
			
		||||
	unlock_sock_fast(sk, slow);
 | 
			
		||||
	return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 | 
			
		||||
			   struct poll_table_struct *wait)
 | 
			
		||||
{
 | 
			
		||||
| 
						 | 
				
			
			@ -2838,8 +2843,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 | 
			
		|||
 | 
			
		||||
	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
 | 
			
		||||
		mask |= mptcp_check_readable(msk);
 | 
			
		||||
		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
 | 
			
		||||
			mask |= EPOLLOUT | EPOLLWRNORM;
 | 
			
		||||
		mask |= mptcp_check_writeable(msk);
 | 
			
		||||
	}
 | 
			
		||||
	if (sk->sk_shutdown & RCV_SHUTDOWN)
 | 
			
		||||
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -86,7 +86,7 @@
 | 
			
		|||
 | 
			
		||||
/* MPTCP socket flags */
 | 
			
		||||
#define MPTCP_DATA_READY	0
 | 
			
		||||
#define MPTCP_SEND_SPACE	1
 | 
			
		||||
#define MPTCP_NOSPACE		1
 | 
			
		||||
#define MPTCP_WORK_RTX		2
 | 
			
		||||
#define MPTCP_WORK_EOF		3
 | 
			
		||||
#define MPTCP_FALLBACK_DONE	4
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -997,17 +997,16 @@ static void subflow_data_ready(struct sock *sk)
 | 
			
		|||
static void subflow_write_space(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 | 
			
		||||
	struct socket *sock = READ_ONCE(sk->sk_socket);
 | 
			
		||||
	struct sock *parent = subflow->conn;
 | 
			
		||||
 | 
			
		||||
	if (!sk_stream_is_writeable(sk))
 | 
			
		||||
		return;
 | 
			
		||||
 | 
			
		||||
	if (sk_stream_is_writeable(parent)) {
 | 
			
		||||
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
 | 
			
		||||
		smp_mb__after_atomic();
 | 
			
		||||
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
 | 
			
		||||
		sk_stream_write_space(parent);
 | 
			
		||||
	}
 | 
			
		||||
	if (sock && sk_stream_is_writeable(parent))
 | 
			
		||||
		clear_bit(SOCK_NOSPACE, &sock->flags);
 | 
			
		||||
 | 
			
		||||
	sk_stream_write_space(parent);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static struct inet_connection_sock_af_ops *
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue