mirror of https://github.com/torvalds/linux.git (synced 2025-11-04 02:30:34 +02:00)
	mptcp: allow picking different xmit subflows
Update the scheduler to a less trivial heuristic: cache the last used
subflow, and try to send on it a reasonably long burst of data. When the
burst or the subflow send space is exhausted, pick the subflow with the
lower ratio between write space and send buffer - that is, the subflow
with the greater relative amount of free space.

v1 -> v2:
 - fix 32 bit build breakage due to 64bits div
 - fix checkpatch issues (uint64_t -> u64)

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 4596a2c1b7
commit d5f49190de

2 changed files with 105 additions and 24 deletions
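Before the diff itself, a rough illustration of the new picking logic may help. The following is a minimal user-space sketch, not the kernel code: struct subflow_model, pick_subflow() and the sample figures in main() are invented for illustration, and the send-buffer and window bookkeeping of the real patch is left out. It only mirrors the fixed-point wmem_queued/pacing_rate ratio and the non-backup preference introduced below.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the per-subflow fields the patch reads. */
struct subflow_model {
	const char *name;
	uint32_t wmem_queued;	/* bytes already queued to the subflow */
	uint32_t pacing_rate;	/* bytes/sec the subflow can drain */
	int backup;		/* 1 if this is a backup subflow */
	int active;		/* stands in for mptcp_subflow_active() */
};

/* Pick the active subflow with the lowest queued/pacing ratio, keeping
 * non-backup and backup candidates in separate slots as the patch does
 * with send_info[0] and send_info[1].
 */
static struct subflow_model *pick_subflow(struct subflow_model *sf, int n)
{
	struct subflow_model *best[2] = { NULL, NULL };
	uint64_t best_ratio[2] = { UINT64_MAX, UINT64_MAX };
	int i, nr_active = 0;

	for (i = 0; i < n; i++) {
		uint64_t ratio;

		if (!sf[i].active || !sf[i].pacing_rate)
			continue;
		nr_active += !sf[i].backup;

		/* same fixed-point ratio as the patch: queued << 32 / pace */
		ratio = ((uint64_t)sf[i].wmem_queued << 32) / sf[i].pacing_rate;
		if (ratio < best_ratio[sf[i].backup]) {
			best_ratio[sf[i].backup] = ratio;
			best[sf[i].backup] = &sf[i];
		}
	}

	/* fall back to the best backup only if no regular subflow is active */
	return nr_active ? best[0] : best[1];
}

int main(void)
{
	struct subflow_model sf[] = {
		{ "wifi",     64000, 1000000, 0, 1 },
		{ "cellular", 16000, 2000000, 0, 1 },
		{ "backup",     100, 5000000, 1, 1 },
	};
	struct subflow_model *pick = pick_subflow(sf, 3);

	/* cellular wins: it drains its queue faster than wifi, and the
	 * backup subflow is ignored while a regular one is available.
	 */
	printf("picked: %s\n", pick ? pick->name : "none");
	return 0;
}

In the patch itself the chosen socket is cached in msk->last_snd and reused until msk->snd_burst, initialised to at most MPTCP_SEND_BURST_SIZE and decremented by each sendmsg chunk, runs out; only then is the ratio comparison repeated.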
				
			
net/mptcp/protocol.c

@@ -1031,41 +1031,105 @@ static void mptcp_nospace(struct mptcp_sock *msk)
 	}
 }
 
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
+	if (subflow->request_join && !subflow->fully_established)
+		return false;
+
+	/* only send if our side has not closed yet */
+	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+#define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
+					 sizeof(struct tcphdr) - \
+					 MAX_TCP_OPTION_SPACE - \
+					 sizeof(struct ipv6hdr) - \
+					 sizeof(struct frag_hdr))
+
+struct subflow_send_info {
+	struct sock *ssk;
+	u64 ratio;
+};
+
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 					   u32 *sndbuf)
 {
+	struct subflow_send_info send_info[2];
 	struct mptcp_subflow_context *subflow;
-	struct sock *sk = (struct sock *)msk;
-	struct sock *backup = NULL;
-	bool free;
+	int i, nr_active = 0;
+	struct sock *ssk;
+	u64 ratio;
+	u32 pace;
 
-	sock_owned_by_me(sk);
+	sock_owned_by_me((struct sock *)msk);
 
 	*sndbuf = 0;
 	if (!mptcp_ext_cache_refill(msk))
 		return NULL;
 
-	mptcp_for_each_subflow(msk, subflow) {
-		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
-		free = sk_stream_is_writeable(subflow->tcp_sock);
-		if (!free) {
-			mptcp_nospace(msk);
+	if (__mptcp_check_fallback(msk)) {
+		if (!msk->first)
 			return NULL;
-		}
-
-		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
-		if (subflow->backup) {
-			if (!backup)
-				backup = ssk;
-
-			continue;
-		}
-
-		return ssk;
+		*sndbuf = msk->first->sk_sndbuf;
+		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
 	}
 
-	return backup;
+	/* re-use last subflow, if the burst allow that */
+	if (msk->last_snd && msk->snd_burst > 0 &&
+	    sk_stream_memory_free(msk->last_snd) &&
+	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+		mptcp_for_each_subflow(msk, subflow) {
+			ssk =  mptcp_subflow_tcp_sock(subflow);
+			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+		}
+		return msk->last_snd;
+	}
+
+	/* pick the subflow with the lower wmem/wspace ratio */
+	for (i = 0; i < 2; ++i) {
+		send_info[i].ssk = NULL;
+		send_info[i].ratio = -1;
+	}
+	mptcp_for_each_subflow(msk, subflow) {
+		ssk =  mptcp_subflow_tcp_sock(subflow);
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
+		nr_active += !subflow->backup;
+		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+		if (!sk_stream_memory_free(subflow->tcp_sock))
+			continue;
+
+		pace = READ_ONCE(ssk->sk_pacing_rate);
+		if (!pace)
+			continue;
+
+		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
+				pace);
+		if (ratio < send_info[subflow->backup].ratio) {
+			send_info[subflow->backup].ssk = ssk;
+			send_info[subflow->backup].ratio = ratio;
+		}
 	}
 
+	pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+		 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+		 send_info[1].ssk, send_info[1].ratio);
+
+	/* pick the best backup if no other subflow is active */
+	if (!nr_active)
+		send_info[0].ssk = send_info[1].ssk;
+
+	if (send_info[0].ssk) {
+		msk->last_snd = send_info[0].ssk;
+		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
+				       sk_stream_wspace(msk->last_snd));
+		return msk->last_snd;
+	}
+	return NULL;
 }
 
 static void ssk_check_wmem(struct mptcp_sock *msk)
@@ -1160,6 +1224,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			break;
 		}
 
+		/* burst can be negative, we will try move to the next subflow
+		 * at selection time, if possible.
+		 */
+		msk->snd_burst -= ret;
 		copied += ret;
 
 		tx_ok = msg_data_left(msg);
@@ -1375,6 +1443,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 	unsigned int moved = 0;
 	bool done;
 
+	/* avoid looping forever below on racing close */
+	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
+		return false;
+
+	__mptcp_flush_join_list(msk);
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
 
@@ -1539,9 +1612,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
 
 	sock_owned_by_me((const struct sock *)msk);
 
+	if (__mptcp_check_fallback(msk))
+		return msk->first;
+
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
 		/* still data outstanding at TCP level?  Don't retransmit. */
 		if (!tcp_write_queue_empty(ssk))
 			return NULL;

net/mptcp/protocol.h
@@ -196,6 +196,8 @@ struct mptcp_sock {
 	u64		write_seq;
 	u64		ack_seq;
 	u64		rcv_data_fin_seq;
+	struct sock	*last_snd;
+	int		snd_burst;
 	atomic64_t	snd_una;
 	unsigned long	timer_ival;
 	u32		token;
@@ -473,12 +475,12 @@ static inline bool before64(__u64 seq1, __u64 seq2)
 
 void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
 
-static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
 {
 	return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
 }
 
-static inline bool mptcp_check_fallback(struct sock *sk)
+static inline bool mptcp_check_fallback(const struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);