mptcp: Use full MPTCP-level disconnect state machine

RFC 8684 appendix D describes the connection state machine for MPTCP.
This patch implements the DATA_FIN / DATA_ACK exchanges and MPTCP-level
socket state changes described in that appendix, rather than simply
sending DATA_FIN along with TCP FIN when disconnecting subflows.

DATA_FIN is now sent and acknowledged before shutting down the
subflows. Received DATA_FIN information (if not part of a data packet)
is written to the MPTCP socket when the incoming DSS option is parsed
by the subflow, and the MPTCP worker is scheduled to process the flag.
DATA_FIN received as part of a full DSS mapping will be handled when
the mapping is processed.

The DATA_FIN is acknowledged by the worker if the reader is caught up.
If there is still data to be moved to the MPTCP-level queue, ack_seq
will be incremented to account for the DATA_FIN when it reaches the end
of the stream and a DATA_ACK will be sent to the peer.

Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
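As a rough illustration of the appendix D close handling described above, here is a minimal, self-contained sketch of the decision a helper like mptcp_close_state() (called from mptcp_close() and mptcp_shutdown() in the diff below) has to make: pick the next MPTCP-level connection state and report whether a DATA_FIN still has to be sent and DATA_ACKed before the subflows may be torn down. The enum and function names in this sketch are invented for the example and are not the patch's code.

/* Illustrative sketch only, not taken from the patch: the MPTCP-level
 * close transition of RFC 8684 appendix D, reduced to the two states
 * from which a DATA_FIN must still be sent.
 */
#include <stdbool.h>

enum conn_state {
	CS_ESTABLISHED,	/* neither side has sent a DATA_FIN yet */
	CS_CLOSE_WAIT,	/* peer's DATA_FIN already received and acked */
	CS_FIN_WAIT1,	/* our DATA_FIN sent, waiting for its DATA_ACK */
	CS_LAST_ACK,	/* answered the peer's DATA_FIN with our own */
	CS_CLOSED,
};

/* Returns true when a DATA_FIN must be sent and acknowledged before the
 * subflows can be shut down; updates *state to the next state.
 */
static bool close_needs_data_fin(enum conn_state *state)
{
	switch (*state) {
	case CS_ESTABLISHED:	/* active close */
		*state = CS_FIN_WAIT1;
		return true;
	case CS_CLOSE_WAIT:	/* passive close */
		*state = CS_LAST_ACK;
		return true;
	default:
		/* Already closing or closed: nothing left to send. */
		return false;
	}
}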
parent 16a9a9da17
commit 43b54c6ee3

3 changed files with 92 additions and 17 deletions

net/mptcp/options.c

@@ -868,6 +868,17 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
 	if (mp_opt.use_ack)
 		update_una(msk, &mp_opt);
 
+	/* Zero-length packets, like bare ACKs carrying a DATA_FIN, are
+	 * dropped by the caller and not propagated to the MPTCP layer.
+	 * Copy the DATA_FIN information now.
+	 */
+	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+		if (mp_opt.data_fin && mp_opt.data_len == 1 &&
+		    mptcp_update_rcv_data_fin(msk, mp_opt.data_seq) &&
+		    schedule_work(&msk->work))
+			sock_hold(subflow->conn);
+	}
+
 	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
 	if (!mpext)
 		return;

net/mptcp/protocol.c

@@ -381,6 +381,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 
 	*bytes = moved;
 
+	/* If the moves have caught up with the DATA_FIN sequence number
+	 * it's time to ack the DATA_FIN and change socket state, but
+	 * this is not a good place to change state. Let the workqueue
+	 * do it.
+	 */
+	if (mptcp_pending_data_fin(sk, NULL) &&
+	    schedule_work(&msk->work))
+		sock_hold(sk);
+
 	return done;
 }
 
@@ -466,7 +475,8 @@ void mptcp_data_acked(struct sock *sk)
 {
 	mptcp_reset_timer(sk);
 
-	if (!sk_stream_is_writeable(sk) &&
+	if ((!sk_stream_is_writeable(sk) ||
+	     (inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
 	    schedule_work(&mptcp_sk(sk)->work))
 		sock_hold(sk);
 }
@@ -1384,6 +1394,7 @@ static void mptcp_worker(struct work_struct *work)
 
 	lock_sock(sk);
 	mptcp_clean_una(sk);
+	mptcp_check_data_fin_ack(sk);
 	__mptcp_flush_join_list(msk);
 	__mptcp_move_skbs(msk);
 
@@ -1393,6 +1404,8 @@ static void mptcp_worker(struct work_struct *work)
 	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
 		mptcp_check_for_eof(msk);
 
+	mptcp_check_data_fin(sk);
+
 	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
 		goto unlock;
 
@@ -1515,7 +1528,7 @@ static void mptcp_cancel_work(struct sock *sk)
 		sock_put(sk);
 }
 
-static void mptcp_subflow_shutdown(struct sock *ssk, int how)
+static void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
 {
 	lock_sock(ssk);
 
@@ -1528,8 +1541,15 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how)
 		tcp_disconnect(ssk, O_NONBLOCK);
 		break;
 	default:
-		ssk->sk_shutdown |= how;
-		tcp_shutdown(ssk, how);
+		if (__mptcp_check_fallback(mptcp_sk(sk))) {
+			pr_debug("Fallback");
+			ssk->sk_shutdown |= how;
+			tcp_shutdown(ssk, how);
+		} else {
+			pr_debug("Sending DATA_FIN on subflow %p", ssk);
+			mptcp_set_timeout(sk, ssk);
+			tcp_send_ack(ssk);
+		}
 		break;
 	}
 
@@ -1570,9 +1590,35 @@ static void mptcp_close(struct sock *sk, long timeout)
 	LIST_HEAD(conn_list);
 
 	lock_sock(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
 
+	if (sk->sk_state == TCP_LISTEN) {
+		inet_sk_state_store(sk, TCP_CLOSE);
+		goto cleanup;
+	} else if (sk->sk_state == TCP_CLOSE) {
+		goto cleanup;
+	}
+
+	if (__mptcp_check_fallback(msk)) {
+		goto update_state;
+	} else if (mptcp_close_state(sk)) {
+		pr_debug("Sending DATA_FIN sk=%p", sk);
+		WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
+		WRITE_ONCE(msk->snd_data_fin_enable, 1);
+
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+			mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK);
+		}
+	}
+
+	sk_stream_wait_close(sk, timeout);
+
+update_state:
 	inet_sk_state_store(sk, TCP_CLOSE);
 
+cleanup:
 	/* be sure to always acquire the join list lock, to sync vs
 	 * mptcp_finish_join().
 	 */
@@ -1581,8 +1627,6 @@ static void mptcp_close(struct sock *sk, long timeout)
 	spin_unlock_bh(&msk->join_list_lock);
 	list_splice_init(&msk->conn_list, &conn_list);
 
-	msk->snd_data_fin_enable = 1;
-
 	__mptcp_clear_xmit(sk);
 
 	release_sock(sk);
@@ -2265,11 +2309,8 @@ static int mptcp_shutdown(struct socket *sock, int how)
 	pr_debug("sk=%p, how=%d", msk, how);
 
 	lock_sock(sock->sk);
-	if (how == SHUT_WR || how == SHUT_RDWR)
-		inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
 
 	how++;
-
 	if ((how & ~SHUTDOWN_MASK) || !how) {
 		ret = -EINVAL;
 		goto out_unlock;
@@ -2283,13 +2324,31 @@ static int mptcp_shutdown(struct socket *sock, int how)
 			sock->state = SS_CONNECTED;
 	}
 
-	__mptcp_flush_join_list(msk);
-	msk->snd_data_fin_enable = 1;
+	/* If we've already sent a FIN, or it's a closed state, skip this. */
+	if (__mptcp_check_fallback(msk)) {
+		if (how == SHUT_WR || how == SHUT_RDWR)
+			inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
 
-	mptcp_for_each_subflow(msk, subflow) {
-		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
 
-		mptcp_subflow_shutdown(tcp_sk, how);
+			mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
+		}
+	} else if ((how & SEND_SHUTDOWN) &&
+		   ((1 << sock->sk->sk_state) &
+		    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+		     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) &&
+		   mptcp_close_state(sock->sk)) {
+		__mptcp_flush_join_list(msk);
+
+		WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
+		WRITE_ONCE(msk->snd_data_fin_enable, 1);
+
+		mptcp_for_each_subflow(msk, subflow) {
+			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+			mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
+		}
 	}
 
 	/* Wake up anyone sleeping in poll. */

net/mptcp/subflow.c

@@ -598,7 +598,8 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
 	return true;
 }
 
-static enum mapping_status get_mapping_status(struct sock *ssk)
+static enum mapping_status get_mapping_status(struct sock *ssk,
+					      struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	struct mptcp_ext *mpext;
@@ -648,7 +649,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
 
 	if (mpext->data_fin == 1) {
 		if (data_len == 1) {
-			pr_debug("DATA_FIN with no payload");
+			mptcp_update_rcv_data_fin(msk, mpext->data_seq);
+			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
 			if (subflow->map_valid) {
 				/* A DATA_FIN might arrive in a DSS
 				 * option before the previous mapping
@@ -660,6 +662,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
 			} else {
 				return MAPPING_DATA_FIN;
 			}
+		} else {
+			mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len);
+			pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len);
 		}
 
 		/* Adjust for DATA_FIN using 1 byte of sequence space */
@@ -748,7 +753,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
 		u64 ack_seq;
 		u64 old_ack;
 
-		status = get_mapping_status(ssk);
+		status = get_mapping_status(ssk, msk);
 		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
 		if (status == MAPPING_INVALID) {
 			ssk->sk_err = EBADMSG;
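To make the receive side concrete, the following is a small, self-contained sketch of the "reader caught up" test that the commit message and the __mptcp_move_skbs_from_subflow() hunk above refer to: the DATA_FIN occupies one byte of data sequence space, so it is acknowledged (and the socket state advanced) only once everything before it has been moved to the MPTCP-level queue. The struct and function names below are invented for this example; they are an approximation of the check, not the patch's implementation of mptcp_pending_data_fin().

/* Illustrative sketch only: the catch-up check behind "ack the DATA_FIN
 * once the reader is caught up". All names are invented for this example.
 */
#include <stdbool.h>
#include <stdint.h>

struct msk_example {
	bool	 rcv_data_fin;		/* DATA_FIN seen in a DSS option */
	uint64_t rcv_data_fin_seq;	/* data sequence number it maps to */
	uint64_t ack_seq;		/* next in-order MPTCP-level sequence */
};

/* True once all data before the DATA_FIN has reached the MPTCP-level
 * receive queue; only then may ack_seq be advanced past the DATA_FIN
 * and a DATA_ACK sent to the peer.
 */
static bool data_fin_can_be_acked(const struct msk_example *msk)
{
	return msk->rcv_data_fin && msk->ack_seq == msk->rcv_data_fin_seq;
}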