	mptcp: allow picking different xmit subflows
Update the scheduler to a less trivial heuristic: cache the last used
subflow, and try to send a reasonably long burst of data on it. When the
burst or the subflow send space is exhausted, pick the subflow with the
lower ratio between write space and send buffer - that is, the subflow
with the greater relative amount of free space.

v1 -> v2:
 - fix 32 bit build breakage due to 64-bit division
 - fix checkpatch issues (uint64_t -> u64)

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 4596a2c1b7
commit d5f49190de

2 changed files with 105 additions and 24 deletions
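Before the diff, a standalone sketch of the new picking logic may help. This is illustrative userspace C, not kernel code: struct subflow_stat, pick_subflow() and the sample numbers are invented for the example; only the core rule mirrors the patch below (lowest (wmem_queued << 32) / pacing_rate wins, and backup subflows are considered only when no regular subflow is active).

/* sketch.c - standalone model of the xmit subflow picker; illustrative
 * only, all names and numbers here are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

struct subflow_stat {
	const char *name;
	uint64_t wmem_queued;	/* bytes already queued to this subflow */
	uint64_t pacing_rate;	/* estimated throughput, bytes/sec */
	int backup;		/* 1 if this subflow is a backup */
	int active;		/* established and usable for data */
};

/* Return the index of the subflow to send on: the lowest
 * (wmem_queued << 32) / pacing_rate ratio wins; backups are tracked in a
 * separate tier and used only when no regular subflow is active.
 */
static int pick_subflow(const struct subflow_stat *sf, int n)
{
	uint64_t best_ratio[2] = { UINT64_MAX, UINT64_MAX };
	int best[2] = { -1, -1 };
	int i, nr_active = 0;

	for (i = 0; i < n; i++) {
		uint64_t ratio;

		if (!sf[i].active)
			continue;
		nr_active += !sf[i].backup;
		if (!sf[i].pacing_rate)
			continue;

		ratio = (sf[i].wmem_queued << 32) / sf[i].pacing_rate;
		if (ratio < best_ratio[sf[i].backup]) {
			best_ratio[sf[i].backup] = ratio;
			best[sf[i].backup] = i;
		}
	}
	/* no active regular subflow: fall back to the best backup */
	return nr_active ? best[0] : best[1];
}

int main(void)
{
	struct subflow_stat sf[] = {
		{ "wifi", 64000, 1000000, 0, 1 },	/* ~64ms worth of queued data */
		{ "lte",  16000, 2000000, 0, 1 },	/* ~8ms worth of queued data */
	};
	int i = pick_subflow(sf, 2);

	printf("send on: %s\n", i >= 0 ? sf[i].name : "(none)");
	return 0;
}

Compiled and run, this prints "send on: lte": the subflow with less queued data per unit of pacing rate is expected to drain its queue first.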
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1031,41 +1031,105 @@ static void mptcp_nospace(struct mptcp_sock *msk)
 	}
 }
 
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
+	if (subflow->request_join && !subflow->fully_established)
+		return false;
+
+	/* only send if our side has not closed yet */
+	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+#define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
+					 sizeof(struct tcphdr) - \
+					 MAX_TCP_OPTION_SPACE - \
+					 sizeof(struct ipv6hdr) - \
+					 sizeof(struct frag_hdr))
+
+struct subflow_send_info {
+	struct sock *ssk;
+	u64 ratio;
+};
+
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 					   u32 *sndbuf)
 {
+	struct subflow_send_info send_info[2];
 	struct mptcp_subflow_context *subflow;
-	struct sock *sk = (struct sock *)msk;
-	struct sock *backup = NULL;
-	bool free;
+	int i, nr_active = 0;
+	struct sock *ssk;
+	u64 ratio;
+	u32 pace;
 
-	sock_owned_by_me(sk);
+	sock_owned_by_me((struct sock *)msk);
 
 	*sndbuf = 0;
 	if (!mptcp_ext_cache_refill(msk))
 		return NULL;
 
-	mptcp_for_each_subflow(msk, subflow) {
-		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
-		free = sk_stream_is_writeable(subflow->tcp_sock);
-		if (!free) {
-			mptcp_nospace(msk);
+	if (__mptcp_check_fallback(msk)) {
+		if (!msk->first)
 			return NULL;
+		*sndbuf = msk->first->sk_sndbuf;
+		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+	}
+
+	/* re-use last subflow, if the burst allow that */
+	if (msk->last_snd && msk->snd_burst > 0 &&
+	    sk_stream_memory_free(msk->last_snd) &&
+	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+		mptcp_for_each_subflow(msk, subflow) {
+			ssk =  mptcp_subflow_tcp_sock(subflow);
+			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
 		}
+		return msk->last_snd;
+	}
 
-		if (subflow->backup) {
-			if (!backup)
-				backup = ssk;
+	/* pick the subflow with the lower wmem/wspace ratio */
+	for (i = 0; i < 2; ++i) {
+		send_info[i].ssk = NULL;
+		send_info[i].ratio = -1;
+	}
+	mptcp_for_each_subflow(msk, subflow) {
+		ssk =  mptcp_subflow_tcp_sock(subflow);
+		if (!mptcp_subflow_active(subflow))
+			continue;
 
-			continue;
+		nr_active += !subflow->backup;
+		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
+		if (!sk_stream_memory_free(subflow->tcp_sock))
+			continue;
+
+		pace = READ_ONCE(ssk->sk_pacing_rate);
+		if (!pace)
+			continue;
+
+		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
+				pace);
+		if (ratio < send_info[subflow->backup].ratio) {
+			send_info[subflow->backup].ssk = ssk;
+			send_info[subflow->backup].ratio = ratio;
 		}
-
-		return ssk;
 	}
 
-	return backup;
+	pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+		 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+		 send_info[1].ssk, send_info[1].ratio);
+
+	/* pick the best backup if no other subflow is active */
+	if (!nr_active)
+		send_info[0].ssk = send_info[1].ssk;
+
+	if (send_info[0].ssk) {
+		msk->last_snd = send_info[0].ssk;
+		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
+				       sk_stream_wspace(msk->last_snd));
+		return msk->last_snd;
+	}
+	return NULL;
 }
 
 static void ssk_check_wmem(struct mptcp_sock *msk)
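A quick sanity check on the burst budget defined above, assuming the usual layout sizes (20-byte struct tcphdr, MAX_TCP_OPTION_SPACE of 40 bytes, 40-byte struct ipv6hdr, 8-byte struct frag_hdr):

	MPTCP_SEND_BURST_SIZE = 65536 - 20 - 40 - 40 - 8 = 65428 bytes

That is, one 64 KB chunk minus worst-case (IPv6 plus fragment header) per-packet overhead, so a full burst plus headers still fits in a single 64 KB GSO packet.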
@@ -1160,6 +1224,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			break;
 		}
 
+		/* burst can be negative, we will try move to the next subflow
+		 * at selection time, if possible.
+		 */
+		msk->snd_burst -= ret;
 		copied += ret;
 
 		tx_ok = msg_data_left(msg);
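A worked example of the accounting above: with snd_burst == 1000, a sendmsg() iteration that pushes 1460 bytes (ret == 1460) leaves snd_burst at -460. Nothing clamps the value here; instead, the next mptcp_subflow_get_send() call fails its msk->snd_burst > 0 check, skips the last-subflow shortcut, re-runs the ratio-based selection, and recharges the burst budget from MPTCP_SEND_BURST_SIZE.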
@@ -1375,6 +1443,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 	unsigned int moved = 0;
 	bool done;
 
+	/* avoid looping forever below on racing close */
+	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
+		return false;
+
+	__mptcp_flush_join_list(msk);
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
 
@@ -1539,9 +1612,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
 
 	sock_owned_by_me((const struct sock *)msk);
 
+	if (__mptcp_check_fallback(msk))
+		return msk->first;
+
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
 		/* still data outstanding at TCP level?  Don't retransmit. */
 		if (!tcp_write_queue_empty(ssk))
 			return NULL;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -196,6 +196,8 @@ struct mptcp_sock {
 	u64		write_seq;
 	u64		ack_seq;
 	u64		rcv_data_fin_seq;
+	struct sock	*last_snd;
+	int		snd_burst;
 	atomic64_t	snd_una;
 	unsigned long	timer_ival;
 	u32		token;
@@ -473,12 +475,12 @@ static inline bool before64(__u64 seq1, __u64 seq2)
 
 void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
 
-static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
 {
 	return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
 }
 
-static inline bool mptcp_check_fallback(struct sock *sk)
+static inline bool mptcp_check_fallback(const struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
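Side note on the two const changes above: they are what allows the new __mptcp_check_fallback() call added in mptcp_subflow_get_retrans(), which receives only a const struct mptcp_sock * (see the @@ -1539,9 hunk).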