forked from mirrors/linux
		
	[TCP]: MTU probing
Implementation of packetization layer path mtu discovery for TCP, based on the internet-draft currently found at <http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>. Signed-off-by: John Heffner <jheffner@psc.edu> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									1d60290f27
								
							
						
					
					
						commit
						5d424d5a67
					
				
					 9 changed files with 326 additions and 37 deletions
				
			
		| 
						 | 
				
			
			@ -397,6 +397,8 @@ enum
 | 
			
		|||
	NET_TCP_CONG_CONTROL=110,
 | 
			
		||||
	NET_TCP_ABC=111,
 | 
			
		||||
	NET_IPV4_IPFRAG_MAX_DIST=112,
 | 
			
		||||
 	NET_TCP_MTU_PROBING=113,
 | 
			
		||||
	NET_TCP_BASE_MSS=114,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
enum {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops {
 | 
			
		|||
 * @icsk_probes_out:	   unanswered 0 window probes
 | 
			
		||||
 * @icsk_ext_hdr_len:	   Network protocol overhead (IP/IPv6 options)
 | 
			
		||||
 * @icsk_ack:		   Delayed ACK control data
 | 
			
		||||
 * @icsk_mtup:		   MTU probing control data
 | 
			
		||||
 */
 | 
			
		||||
struct inet_connection_sock {
 | 
			
		||||
	/* inet_sock has to be the first member! */
 | 
			
		||||
| 
						 | 
				
			
			@ -104,6 +105,18 @@ struct inet_connection_sock {
 | 
			
		|||
		__u16		  last_seg_size; /* Size of last incoming segment	   */
 | 
			
		||||
		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */ 
 | 
			
		||||
	} icsk_ack;
 | 
			
		||||
	struct {
 | 
			
		||||
		int		  enabled;
 | 
			
		||||
 | 
			
		||||
		/* Range of MTUs to search */
 | 
			
		||||
		int		  search_high;
 | 
			
		||||
		int		  search_low;
 | 
			
		||||
 | 
			
		||||
		/* Information on the current probe. */
 | 
			
		||||
		int		  probe_size;
 | 
			
		||||
		__u32		  probe_seq_start;
 | 
			
		||||
		__u32		  probe_seq_end;
 | 
			
		||||
	} icsk_mtup;
 | 
			
		||||
	u32			  icsk_ca_priv[16];
 | 
			
		||||
#define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
 | 
			
		||||
};
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 | 
			
		|||
/* Minimal RCV_MSS. */
 | 
			
		||||
#define TCP_MIN_RCVMSS		536U
 | 
			
		||||
 | 
			
		||||
/* The least MTU to use for probing */
 | 
			
		||||
#define TCP_BASE_MSS		512
 | 
			
		||||
 | 
			
		||||
/* After receiving this amount of duplicate ACKs fast retransmit starts. */
 | 
			
		||||
#define TCP_FASTRETRANS_THRESH 3
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save;
 | 
			
		|||
extern int sysctl_tcp_moderate_rcvbuf;
 | 
			
		||||
extern int sysctl_tcp_tso_win_divisor;
 | 
			
		||||
extern int sysctl_tcp_abc;
 | 
			
		||||
extern int sysctl_tcp_mtu_probing;
 | 
			
		||||
extern int sysctl_tcp_base_mss;
 | 
			
		||||
 | 
			
		||||
extern atomic_t tcp_memory_allocated;
 | 
			
		||||
extern atomic_t tcp_sockets_allocated;
 | 
			
		||||
| 
						 | 
				
			
			@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 | 
			
		|||
 | 
			
		||||
extern void tcp_initialize_rcv_mss(struct sock *sk);
 | 
			
		||||
 | 
			
		||||
extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 | 
			
		||||
extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 | 
			
		||||
extern void tcp_mtup_init(struct sock *sk);
 | 
			
		||||
 | 
			
		||||
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 | 
			
		||||
{
 | 
			
		||||
	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -664,6 +664,22 @@ ctl_table ipv4_table[] = {
 | 
			
		|||
		.mode		= 0644,
 | 
			
		||||
		.proc_handler	= &proc_dointvec,
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		.ctl_name	= NET_TCP_MTU_PROBING,
 | 
			
		||||
		.procname	= "tcp_mtu_probing",
 | 
			
		||||
		.data		= &sysctl_tcp_mtu_probing,
 | 
			
		||||
		.maxlen		= sizeof(int),
 | 
			
		||||
		.mode		= 0644,
 | 
			
		||||
		.proc_handler	= &proc_dointvec,
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		.ctl_name	= NET_TCP_BASE_MSS,
 | 
			
		||||
		.procname	= "tcp_base_mss",
 | 
			
		||||
		.data		= &sysctl_tcp_base_mss,
 | 
			
		||||
		.maxlen		= sizeof(int),
 | 
			
		||||
		.mode		= 0644,
 | 
			
		||||
		.proc_handler	= &proc_dointvec,
 | 
			
		||||
	},
 | 
			
		||||
 | 
			
		||||
	{ .ctl_name = 0 }
 | 
			
		||||
};
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1891,6 +1891,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
 | 
			
		|||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void tcp_mtup_probe_failed(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
 | 
			
		||||
	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
 | 
			
		||||
	icsk->icsk_mtup.probe_size = 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
 | 
			
		||||
{
 | 
			
		||||
	struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
 | 
			
		||||
	/* FIXME: breaks with very large cwnd */
 | 
			
		||||
	tp->prior_ssthresh = tcp_current_ssthresh(sk);
 | 
			
		||||
	tp->snd_cwnd = tp->snd_cwnd *
 | 
			
		||||
		       tcp_mss_to_mtu(sk, tp->mss_cache) /
 | 
			
		||||
		       icsk->icsk_mtup.probe_size;
 | 
			
		||||
	tp->snd_cwnd_cnt = 0;
 | 
			
		||||
	tp->snd_cwnd_stamp = tcp_time_stamp;
 | 
			
		||||
	tp->rcv_ssthresh = tcp_current_ssthresh(sk);
 | 
			
		||||
 | 
			
		||||
	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
 | 
			
		||||
	icsk->icsk_mtup.probe_size = 0;
 | 
			
		||||
	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/* Process an event, which can update packets-in-flight not trivially.
 | 
			
		||||
 * Main goal of this function is to calculate new estimate for left_out,
 | 
			
		||||
 * taking into account both packets sitting in receiver's buffer and
 | 
			
		||||
| 
						 | 
				
			
			@ -2023,6 +2051,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 | 
			
		|||
			return;
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		/* MTU probe failure: don't reduce cwnd */
 | 
			
		||||
		if (icsk->icsk_ca_state < TCP_CA_CWR &&
 | 
			
		||||
		    icsk->icsk_mtup.probe_size &&
 | 
			
		||||
		    tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
 | 
			
		||||
			tcp_mtup_probe_failed(sk);
 | 
			
		||||
			/* Restores the reduction we did in tcp_mtup_probe() */
 | 
			
		||||
			tp->snd_cwnd++;
 | 
			
		||||
			tcp_simple_retransmit(sk);
 | 
			
		||||
			return;
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		/* Otherwise enter Recovery state */
 | 
			
		||||
 | 
			
		||||
		if (IsReno(tp))
 | 
			
		||||
| 
						 | 
				
			
			@ -2243,6 +2282,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 | 
			
		|||
			tp->retrans_stamp = 0;
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		/* MTU probing checks */
 | 
			
		||||
		if (icsk->icsk_mtup.probe_size) {
 | 
			
		||||
			if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
 | 
			
		||||
				tcp_mtup_probe_success(sk, skb);
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if (sacked) {
 | 
			
		||||
			if (sacked & TCPCB_RETRANS) {
 | 
			
		||||
				if(sacked & TCPCB_SACKED_RETRANS)
 | 
			
		||||
| 
						 | 
				
			
			@ -4101,6 +4147,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 | 
			
		|||
		if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
 | 
			
		||||
			tp->rx_opt.sack_ok |= 2;
 | 
			
		||||
 | 
			
		||||
		tcp_mtup_init(sk);
 | 
			
		||||
		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 | 
			
		||||
		tcp_initialize_rcv_mss(sk);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -4211,6 +4258,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 | 
			
		|||
		if (tp->ecn_flags&TCP_ECN_OK)
 | 
			
		||||
			sock_set_flag(sk, SOCK_NO_LARGESEND);
 | 
			
		||||
 | 
			
		||||
		tcp_mtup_init(sk);
 | 
			
		||||
		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 | 
			
		||||
		tcp_initialize_rcv_mss(sk);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -4399,6 +4447,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 | 
			
		|||
				 */
 | 
			
		||||
				tp->lsndtime = tcp_time_stamp;
 | 
			
		||||
 | 
			
		||||
				tcp_mtup_init(sk);
 | 
			
		||||
				tcp_initialize_rcv_mss(sk);
 | 
			
		||||
				tcp_init_buffer_space(sk);
 | 
			
		||||
				tcp_fast_path_on(tp);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -900,6 +900,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 | 
			
		|||
		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
 | 
			
		||||
	newinet->id = newtp->write_seq ^ jiffies;
 | 
			
		||||
 | 
			
		||||
	tcp_mtup_init(newsk);
 | 
			
		||||
	tcp_sync_mss(newsk, dst_mtu(dst));
 | 
			
		||||
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 | 
			
		||||
	tcp_initialize_rcv_mss(newsk);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1;
 | 
			
		|||
 */
 | 
			
		||||
int sysctl_tcp_tso_win_divisor = 3;
 | 
			
		||||
 | 
			
		||||
int sysctl_tcp_mtu_probing = 0;
 | 
			
		||||
int sysctl_tcp_base_mss = 512;
 | 
			
		||||
 | 
			
		||||
EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
 | 
			
		||||
EXPORT_SYMBOL(sysctl_tcp_base_mss);
 | 
			
		||||
 | 
			
		||||
static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 | 
			
		||||
			     struct sk_buff *skb)
 | 
			
		||||
{
 | 
			
		||||
| 
						 | 
				
			
			@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 | 
			
		|||
	return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Not accounting for SACKs here. */
 | 
			
		||||
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 | 
			
		||||
{
 | 
			
		||||
	struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
	int mss_now;
 | 
			
		||||
 | 
			
		||||
	/* Calculate base mss without TCP options:
 | 
			
		||||
	   It is MMS_S - sizeof(tcphdr) of rfc1122
 | 
			
		||||
	 */
 | 
			
		||||
	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
 | 
			
		||||
 | 
			
		||||
	/* Clamp it (mss_clamp does not include tcp options) */
 | 
			
		||||
	if (mss_now > tp->rx_opt.mss_clamp)
 | 
			
		||||
		mss_now = tp->rx_opt.mss_clamp;
 | 
			
		||||
 | 
			
		||||
	/* Now subtract optional transport overhead */
 | 
			
		||||
	mss_now -= icsk->icsk_ext_hdr_len;
 | 
			
		||||
 | 
			
		||||
	/* Then reserve room for full set of TCP options and 8 bytes of data */
 | 
			
		||||
	if (mss_now < 48)
 | 
			
		||||
		mss_now = 48;
 | 
			
		||||
 | 
			
		||||
	/* Now subtract TCP options size, not including SACKs */
 | 
			
		||||
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
 | 
			
		||||
 | 
			
		||||
	return mss_now;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Inverse of above */
 | 
			
		||||
int tcp_mss_to_mtu(struct sock *sk, int mss)
 | 
			
		||||
{
 | 
			
		||||
	struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
	int mtu;
 | 
			
		||||
 | 
			
		||||
	mtu = mss +
 | 
			
		||||
	      tp->tcp_header_len +
 | 
			
		||||
	      icsk->icsk_ext_hdr_len +
 | 
			
		||||
	      icsk->icsk_af_ops->net_header_len;
 | 
			
		||||
 | 
			
		||||
	return mtu;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void tcp_mtup_init(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
 | 
			
		||||
	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
 | 
			
		||||
	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
 | 
			
		||||
	                       icsk->icsk_af_ops->net_header_len;
 | 
			
		||||
	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
 | 
			
		||||
	icsk->icsk_mtup.probe_size = 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* This function synchronizes snd mss to current pmtu/exthdr set.
 | 
			
		||||
 | 
			
		||||
   tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
 | 
			
		||||
| 
						 | 
				
			
			@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 | 
			
		|||
{
 | 
			
		||||
	struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
	/* Calculate base mss without TCP options:
 | 
			
		||||
	   It is MMS_S - sizeof(tcphdr) of rfc1122
 | 
			
		||||
	 */
 | 
			
		||||
	int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
 | 
			
		||||
		       sizeof(struct tcphdr));
 | 
			
		||||
	int mss_now;
 | 
			
		||||
 | 
			
		||||
	/* Clamp it (mss_clamp does not include tcp options) */
 | 
			
		||||
	if (mss_now > tp->rx_opt.mss_clamp)
 | 
			
		||||
		mss_now = tp->rx_opt.mss_clamp;
 | 
			
		||||
	if (icsk->icsk_mtup.search_high > pmtu)
 | 
			
		||||
		icsk->icsk_mtup.search_high = pmtu;
 | 
			
		||||
 | 
			
		||||
	/* Now subtract optional transport overhead */
 | 
			
		||||
	mss_now -= icsk->icsk_ext_hdr_len;
 | 
			
		||||
 | 
			
		||||
	/* Then reserve room for full set of TCP options and 8 bytes of data */
 | 
			
		||||
	if (mss_now < 48)
 | 
			
		||||
		mss_now = 48;
 | 
			
		||||
 | 
			
		||||
	/* Now subtract TCP options size, not including SACKs */
 | 
			
		||||
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
 | 
			
		||||
	mss_now = tcp_mtu_to_mss(sk, pmtu);
 | 
			
		||||
 | 
			
		||||
	/* Bound mss with half of window */
 | 
			
		||||
	if (tp->max_window && mss_now > (tp->max_window>>1))
 | 
			
		||||
| 
						 | 
				
			
			@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 | 
			
		|||
 | 
			
		||||
	/* And store cached results */
 | 
			
		||||
	icsk->icsk_pmtu_cookie = pmtu;
 | 
			
		||||
	if (icsk->icsk_mtup.enabled)
 | 
			
		||||
		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
 | 
			
		||||
	tp->mss_cache = mss_now;
 | 
			
		||||
 | 
			
		||||
	return mss_now;
 | 
			
		||||
| 
						 | 
				
			
			@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
 | 
			
		|||
	return 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Create a new MTU probe if we are ready.
 | 
			
		||||
 * Returns 0 if we should wait to probe (no cwnd available),
 | 
			
		||||
 *         1 if a probe was sent,
 | 
			
		||||
 *         -1 otherwise */
 | 
			
		||||
static int tcp_mtu_probe(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
	struct sk_buff *skb, *nskb, *next;
 | 
			
		||||
	int len;
 | 
			
		||||
	int probe_size;
 | 
			
		||||
	unsigned int pif;
 | 
			
		||||
	int copy;
 | 
			
		||||
	int mss_now;
 | 
			
		||||
 | 
			
		||||
	/* Not currently probing/verifying,
 | 
			
		||||
	 * not in recovery,
 | 
			
		||||
	 * have enough cwnd, and
 | 
			
		||||
	 * not SACKing (the variable headers throw things off) */
 | 
			
		||||
	if (!icsk->icsk_mtup.enabled ||
 | 
			
		||||
	    icsk->icsk_mtup.probe_size ||
 | 
			
		||||
	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
 | 
			
		||||
	    tp->snd_cwnd < 11 ||
 | 
			
		||||
	    tp->rx_opt.eff_sacks)
 | 
			
		||||
		return -1;
 | 
			
		||||
 | 
			
		||||
	/* Very simple search strategy: just double the MSS. */
 | 
			
		||||
	mss_now = tcp_current_mss(sk, 0);
 | 
			
		||||
	probe_size = 2*tp->mss_cache;
 | 
			
		||||
	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
 | 
			
		||||
		/* TODO: set timer for probe_converge_event */
 | 
			
		||||
		return -1;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/* Have enough data in the send queue to probe? */
 | 
			
		||||
	len = 0;
 | 
			
		||||
	if ((skb = sk->sk_send_head) == NULL)
 | 
			
		||||
		return -1;
 | 
			
		||||
	while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
 | 
			
		||||
		skb = skb->next;
 | 
			
		||||
	if (len < probe_size)
 | 
			
		||||
		return -1;
 | 
			
		||||
 | 
			
		||||
	/* Receive window check. */
 | 
			
		||||
	if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
 | 
			
		||||
		if (tp->snd_wnd < probe_size)
 | 
			
		||||
			return -1;
 | 
			
		||||
		else
 | 
			
		||||
			return 0;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/* Do we need to wait to drain cwnd? */
 | 
			
		||||
	pif = tcp_packets_in_flight(tp);
 | 
			
		||||
	if (pif + 2 > tp->snd_cwnd) {
 | 
			
		||||
		/* With no packets in flight, don't stall. */
 | 
			
		||||
		if (pif == 0)
 | 
			
		||||
			return -1;
 | 
			
		||||
		else
 | 
			
		||||
			return 0;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/* We're allowed to probe.  Build it now. */
 | 
			
		||||
	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
 | 
			
		||||
		return -1;
 | 
			
		||||
	sk_charge_skb(sk, nskb);
 | 
			
		||||
 | 
			
		||||
	skb = sk->sk_send_head;
 | 
			
		||||
	__skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
 | 
			
		||||
	sk->sk_send_head = nskb;
 | 
			
		||||
 | 
			
		||||
	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
 | 
			
		||||
	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
 | 
			
		||||
	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
 | 
			
		||||
	TCP_SKB_CB(nskb)->sacked = 0;
 | 
			
		||||
	nskb->csum = 0;
 | 
			
		||||
	if (skb->ip_summed == CHECKSUM_HW)
 | 
			
		||||
		nskb->ip_summed = CHECKSUM_HW;
 | 
			
		||||
 | 
			
		||||
	len = 0;
 | 
			
		||||
	while (len < probe_size) {
 | 
			
		||||
		next = skb->next;
 | 
			
		||||
 | 
			
		||||
		copy = min_t(int, skb->len, probe_size - len);
 | 
			
		||||
		if (nskb->ip_summed)
 | 
			
		||||
			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
 | 
			
		||||
		else
 | 
			
		||||
			nskb->csum = skb_copy_and_csum_bits(skb, 0,
 | 
			
		||||
			                 skb_put(nskb, copy), copy, nskb->csum);
 | 
			
		||||
 | 
			
		||||
		if (skb->len <= copy) {
 | 
			
		||||
			/* We've eaten all the data from this skb.
 | 
			
		||||
			 * Throw it away. */
 | 
			
		||||
			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
 | 
			
		||||
			__skb_unlink(skb, &sk->sk_write_queue);
 | 
			
		||||
			sk_stream_free_skb(sk, skb);
 | 
			
		||||
		} else {
 | 
			
		||||
			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
 | 
			
		||||
			                           ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
 | 
			
		||||
			if (!skb_shinfo(skb)->nr_frags) {
 | 
			
		||||
				skb_pull(skb, copy);
 | 
			
		||||
				if (skb->ip_summed != CHECKSUM_HW)
 | 
			
		||||
					skb->csum = csum_partial(skb->data, skb->len, 0);
 | 
			
		||||
			} else {
 | 
			
		||||
				__pskb_trim_head(skb, copy);
 | 
			
		||||
				tcp_set_skb_tso_segs(sk, skb, mss_now);
 | 
			
		||||
			}
 | 
			
		||||
			TCP_SKB_CB(skb)->seq += copy;
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		len += copy;
 | 
			
		||||
		skb = next;
 | 
			
		||||
	}
 | 
			
		||||
	tcp_init_tso_segs(sk, nskb, nskb->len);
 | 
			
		||||
 | 
			
		||||
	/* We're ready to send.  If this fails, the probe will
 | 
			
		||||
	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
 | 
			
		||||
	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
 | 
			
		||||
	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
 | 
			
		||||
		/* Decrement cwnd here because we are sending
 | 
			
		||||
		* effectively two packets. */
 | 
			
		||||
		tp->snd_cwnd--;
 | 
			
		||||
		update_send_head(sk, tp, nskb);
 | 
			
		||||
 | 
			
		||||
		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
 | 
			
		||||
		icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
 | 
			
		||||
		icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
 | 
			
		||||
 | 
			
		||||
		return 1;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return -1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/* This routine writes packets to the network.  It advances the
 | 
			
		||||
 * send_head.  This happens as incoming acks open up the remote
 | 
			
		||||
 * window for us.
 | 
			
		||||
| 
						 | 
				
			
			@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 | 
			
		|||
	struct sk_buff *skb;
 | 
			
		||||
	unsigned int tso_segs, sent_pkts;
 | 
			
		||||
	int cwnd_quota;
 | 
			
		||||
	int result;
 | 
			
		||||
 | 
			
		||||
	/* If we are closed, the bytes will have to remain here.
 | 
			
		||||
	 * In time closedown will finish, we empty the write queue and all
 | 
			
		||||
| 
						 | 
				
			
			@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 | 
			
		|||
		return 0;
 | 
			
		||||
 | 
			
		||||
	sent_pkts = 0;
 | 
			
		||||
 | 
			
		||||
	/* Do MTU probing. */
 | 
			
		||||
	if ((result = tcp_mtu_probe(sk)) == 0) {
 | 
			
		||||
		return 0;
 | 
			
		||||
	} else if (result > 0) {
 | 
			
		||||
		sent_pkts = 1;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	while ((skb = sk->sk_send_head)) {
 | 
			
		||||
		unsigned int limit;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk)
 | 
			
		|||
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 | 
			
		||||
{
 | 
			
		||||
	struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
 	unsigned int cur_mss = tcp_current_mss(sk, 0);
 | 
			
		||||
	int err;
 | 
			
		||||
 | 
			
		||||
	/* Inconclusive MTU probe */
 | 
			
		||||
	if (icsk->icsk_mtup.probe_size) {
 | 
			
		||||
		icsk->icsk_mtup.probe_size = 0;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/* Do not send more than we queued. 1/4 is reserved for possible
 | 
			
		||||
	 * copying overhead: fragmentation, tunneling, mangling etc.
 | 
			
		||||
	 */
 | 
			
		||||
| 
						 | 
				
			
			@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk)
 | 
			
		|||
	if (tp->rx_opt.user_mss)
 | 
			
		||||
		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
 | 
			
		||||
	tp->max_window = 0;
 | 
			
		||||
	tcp_mtup_init(sk);
 | 
			
		||||
	tcp_sync_mss(sk, dst_mtu(dst));
 | 
			
		||||
 | 
			
		||||
	if (!tp->window_clamp)
 | 
			
		||||
| 
						 | 
				
			
			@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack);
 | 
			
		|||
EXPORT_SYMBOL(tcp_simple_retransmit);
 | 
			
		||||
EXPORT_SYMBOL(tcp_sync_mss);
 | 
			
		||||
EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
 | 
			
		||||
EXPORT_SYMBOL(tcp_mtup_init);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -119,8 +119,10 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
 | 
			
		|||
/* A write timeout has occurred. Process the after effects. */
 | 
			
		||||
static int tcp_write_timeout(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	const struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
	struct inet_connection_sock *icsk = inet_csk(sk);
 | 
			
		||||
	struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	int retry_until;
 | 
			
		||||
	int mss;
 | 
			
		||||
 | 
			
		||||
	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 | 
			
		||||
		if (icsk->icsk_retransmits)
 | 
			
		||||
| 
						 | 
				
			
			@ -128,25 +130,19 @@ static int tcp_write_timeout(struct sock *sk)
 | 
			
		|||
		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 | 
			
		||||
	} else {
 | 
			
		||||
		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
 | 
			
		||||
			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
 | 
			
		||||
			   hole detection. :-(
 | 
			
		||||
 | 
			
		||||
			   It is place to make it. It is not made. I do not want
 | 
			
		||||
			   to make it. It is disgusting. It does not work in any
 | 
			
		||||
			   case. Let me to cite the same draft, which requires for
 | 
			
		||||
			   us to implement this:
 | 
			
		||||
 | 
			
		||||
   "The one security concern raised by this memo is that ICMP black holes
 | 
			
		||||
   are often caused by over-zealous security administrators who block
 | 
			
		||||
   all ICMP messages.  It is vitally important that those who design and
 | 
			
		||||
   deploy security systems understand the impact of strict filtering on
 | 
			
		||||
   upper-layer protocols.  The safest web site in the world is worthless
 | 
			
		||||
   if most TCP implementations cannot transfer data from it.  It would
 | 
			
		||||
   be far nicer to have all of the black holes fixed rather than fixing
 | 
			
		||||
   all of the TCP implementations."
 | 
			
		||||
 | 
			
		||||
                           Golden words :-).
 | 
			
		||||
		   */
 | 
			
		||||
			/* Black hole detection */
 | 
			
		||||
			if (sysctl_tcp_mtu_probing) {
 | 
			
		||||
				if (!icsk->icsk_mtup.enabled) {
 | 
			
		||||
					icsk->icsk_mtup.enabled = 1;
 | 
			
		||||
					tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 | 
			
		||||
				} else {
 | 
			
		||||
					mss = min(sysctl_tcp_base_mss,
 | 
			
		||||
					          tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
 | 
			
		||||
					mss = max(mss, 68 - tp->tcp_header_len);
 | 
			
		||||
					icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
 | 
			
		||||
					tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			dst_negative_advice(&sk->sk_dst_cache);
 | 
			
		||||
		}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -987,6 +987,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 | 
			
		|||
		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
 | 
			
		||||
						     newnp->opt->opt_flen);
 | 
			
		||||
 | 
			
		||||
	tcp_mtup_init(newsk);
 | 
			
		||||
	tcp_sync_mss(newsk, dst_mtu(dst));
 | 
			
		||||
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 | 
			
		||||
	tcp_initialize_rcv_mss(newsk);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue