mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	net/rds: RDS-TCP: Always create a new rds_sock for an incoming connection.
When running RDS over TCP, the active (client) side connects to the
listening ("passive") side at the RDS_TCP_PORT.  After the connection
is established, if the client side reboots (potentially without even
sending a FIN) the server still has a TCP socket in the established
state.  If the server-side now gets a new SYN from the client
with a different client port, TCP will create a new socket-pair, but
the RDS layer will incorrectly pull up the old rds_connection (which
is still associated with the stale t_sock and RDS socket state).
This patch corrects this behavior by having rds_tcp_accept_one()
always create a new connection for an incoming TCP SYN.
The rds and tcp state associated with the old socket-pair is cleaned
up via the rds_tcp_state_change() callback, which is typically
invoked when the client-TCP sends a FIN on TCP restart,
triggering a transition to CLOSE_WAIT state. In the rarer event of client
death without a FIN, TCP_KEEPALIVE probes on the socket will detect
the stale socket, and the TCP transition to CLOSE state will trigger
the RDS state cleanup.
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									e16e888b52
								
							
						
					
					
						commit
						f711a6ae06
					
				
					 3 changed files with 51 additions and 0 deletions
				
			
		| 
						 | 
					@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 | 
				
			||||||
	struct rds_transport *loop_trans;
 | 
						struct rds_transport *loop_trans;
 | 
				
			||||||
	unsigned long flags;
 | 
						unsigned long flags;
 | 
				
			||||||
	int ret;
 | 
						int ret;
 | 
				
			||||||
 | 
						struct rds_transport *otrans = trans;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
 | 
				
			||||||
 | 
							goto new_conn;
 | 
				
			||||||
	rcu_read_lock();
 | 
						rcu_read_lock();
 | 
				
			||||||
	conn = rds_conn_lookup(head, laddr, faddr, trans);
 | 
						conn = rds_conn_lookup(head, laddr, faddr, trans);
 | 
				
			||||||
	if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
 | 
						if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
 | 
				
			||||||
| 
						 | 
					@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 | 
				
			||||||
	if (conn)
 | 
						if (conn)
 | 
				
			||||||
		goto out;
 | 
							goto out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					new_conn:
 | 
				
			||||||
	conn = kmem_cache_zalloc(rds_conn_slab, gfp);
 | 
						conn = kmem_cache_zalloc(rds_conn_slab, gfp);
 | 
				
			||||||
	if (!conn) {
 | 
						if (!conn) {
 | 
				
			||||||
		conn = ERR_PTR(-ENOMEM);
 | 
							conn = ERR_PTR(-ENOMEM);
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk)
 | 
				
			||||||
		case TCP_ESTABLISHED:
 | 
							case TCP_ESTABLISHED:
 | 
				
			||||||
			rds_connect_complete(conn);
 | 
								rds_connect_complete(conn);
 | 
				
			||||||
			break;
 | 
								break;
 | 
				
			||||||
 | 
							case TCP_CLOSE_WAIT:
 | 
				
			||||||
		case TCP_CLOSE:
 | 
							case TCP_CLOSE:
 | 
				
			||||||
			rds_conn_drop(conn);
 | 
								rds_conn_drop(conn);
 | 
				
			||||||
		default:
 | 
							default:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work);
 | 
				
			||||||
static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
 | 
					static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
 | 
				
			||||||
static struct socket *rds_tcp_listen_sock;
 | 
					static struct socket *rds_tcp_listen_sock;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int rds_tcp_keepalive(struct socket *sock)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						/* values below based on xs_udp_default_timeout */
 | 
				
			||||||
 | 
						int keepidle = 5; /* send a probe 'keepidle' secs after last data */
 | 
				
			||||||
 | 
						int keepcnt = 5; /* number of unack'ed probes before declaring dead */
 | 
				
			||||||
 | 
						int keepalive = 1;
 | 
				
			||||||
 | 
						int ret = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
 | 
				
			||||||
 | 
									(char *)&keepalive, sizeof(keepalive));
 | 
				
			||||||
 | 
						if (ret < 0)
 | 
				
			||||||
 | 
							goto bail;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
 | 
				
			||||||
 | 
									(char *)&keepcnt, sizeof(keepcnt));
 | 
				
			||||||
 | 
						if (ret < 0)
 | 
				
			||||||
 | 
							goto bail;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
 | 
				
			||||||
 | 
									(char *)&keepidle, sizeof(keepidle));
 | 
				
			||||||
 | 
						if (ret < 0)
 | 
				
			||||||
 | 
							goto bail;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* KEEPINTVL is the interval between successive probes. We follow
 | 
				
			||||||
 | 
						 * the model in xs_tcp_finish_connecting() and re-use keepidle.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
 | 
				
			||||||
 | 
									(char *)&keepidle, sizeof(keepidle));
 | 
				
			||||||
 | 
					bail:
 | 
				
			||||||
 | 
						return ret;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static int rds_tcp_accept_one(struct socket *sock)
 | 
					static int rds_tcp_accept_one(struct socket *sock)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	struct socket *new_sock = NULL;
 | 
						struct socket *new_sock = NULL;
 | 
				
			||||||
	struct rds_connection *conn;
 | 
						struct rds_connection *conn;
 | 
				
			||||||
	int ret;
 | 
						int ret;
 | 
				
			||||||
	struct inet_sock *inet;
 | 
						struct inet_sock *inet;
 | 
				
			||||||
 | 
						struct rds_tcp_connection *rs_tcp;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
 | 
						ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
 | 
				
			||||||
			       sock->sk->sk_protocol, &new_sock);
 | 
								       sock->sk->sk_protocol, &new_sock);
 | 
				
			||||||
| 
						 | 
					@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock)
 | 
				
			||||||
	if (ret < 0)
 | 
						if (ret < 0)
 | 
				
			||||||
		goto out;
 | 
							goto out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ret = rds_tcp_keepalive(new_sock);
 | 
				
			||||||
 | 
						if (ret < 0)
 | 
				
			||||||
 | 
							goto out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	rds_tcp_tune(new_sock);
 | 
						rds_tcp_tune(new_sock);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	inet = inet_sk(new_sock->sk);
 | 
						inet = inet_sk(new_sock->sk);
 | 
				
			||||||
| 
						 | 
					@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock)
 | 
				
			||||||
		ret = PTR_ERR(conn);
 | 
							ret = PTR_ERR(conn);
 | 
				
			||||||
		goto out;
 | 
							goto out;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
						/* An incoming SYN request came in, and TCP just accepted it.
 | 
				
			||||||
 | 
						 * We always create a new conn for listen side of TCP, and do not
 | 
				
			||||||
 | 
						 * add it to the c_hash_list.
 | 
				
			||||||
 | 
						 *
 | 
				
			||||||
 | 
						 * If the client reboots, this conn will need to be cleaned up.
 | 
				
			||||||
 | 
						 * rds_tcp_state_change() will do that cleanup
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
 | 
				
			||||||
 | 
						WARN_ON(!rs_tcp || rs_tcp->t_sock);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * see the comment above rds_queue_delayed_reconnect()
 | 
						 * see the comment above rds_queue_delayed_reconnect()
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue