RDS/IB: Infiniband transport

Registers as an RDS transport and an IB client, and uses the IB CM API to
allocate ids, queue pairs, and the rest of that fun stuff.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:

parent eff5f53bef
commit ec16227e14

3 changed files with 1416 additions and 0 deletions
							
								
								
									
net/rds/ib.c (new file, 323 additions)
							|  | @ -0,0 +1,323 @@ | |||
| /*
 | ||||
|  * Copyright (c) 2006 Oracle.  All rights reserved. | ||||
|  * | ||||
|  * This software is available to you under a choice of one of two | ||||
|  * licenses.  You may choose to be licensed under the terms of the GNU | ||||
|  * General Public License (GPL) Version 2, available from the file | ||||
|  * COPYING in the main directory of this source tree, or the | ||||
|  * OpenIB.org BSD license below: | ||||
|  * | ||||
|  *     Redistribution and use in source and binary forms, with or | ||||
|  *     without modification, are permitted provided that the following | ||||
|  *     conditions are met: | ||||
|  * | ||||
|  *      - Redistributions of source code must retain the above | ||||
|  *        copyright notice, this list of conditions and the following | ||||
|  *        disclaimer. | ||||
|  * | ||||
|  *      - Redistributions in binary form must reproduce the above | ||||
|  *        copyright notice, this list of conditions and the following | ||||
|  *        disclaimer in the documentation and/or other materials | ||||
|  *        provided with the distribution. | ||||
|  * | ||||
|  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||||
|  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||||
|  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||||
|  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||||
|  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||||
|  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||||
|  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
|  * SOFTWARE. | ||||
|  * | ||||
|  */ | ||||
| #include <linux/kernel.h> | ||||
| #include <linux/in.h> | ||||
| #include <linux/if.h> | ||||
| #include <linux/netdevice.h> | ||||
| #include <linux/inetdevice.h> | ||||
| #include <linux/if_arp.h> | ||||
| #include <linux/delay.h> | ||||
| 
 | ||||
| #include "rds.h" | ||||
| #include "ib.h" | ||||
| 
 | ||||
| unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; | ||||
| unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ | ||||
| 
 | ||||
| module_param(fmr_pool_size, int, 0444); | ||||
| MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); | ||||
| module_param(fmr_message_size, int, 0444); | ||||
| MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); | ||||
| 
 | ||||
| struct list_head rds_ib_devices; | ||||
| 
 | ||||
| DEFINE_SPINLOCK(ib_nodev_conns_lock); | ||||
| LIST_HEAD(ib_nodev_conns); | ||||
| 
 | ||||
| void rds_ib_add_one(struct ib_device *device) | ||||
| { | ||||
| 	struct rds_ib_device *rds_ibdev; | ||||
| 	struct ib_device_attr *dev_attr; | ||||
| 
 | ||||
| 	/* Only handle IB (no iWARP) devices */ | ||||
| 	if (device->node_type != RDMA_NODE_IB_CA) | ||||
| 		return; | ||||
| 
 | ||||
| 	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); | ||||
| 	if (!dev_attr) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (ib_query_device(device, dev_attr)) { | ||||
| 		rdsdebug("Query device failed for %s\n", device->name); | ||||
| 		goto free_attr; | ||||
| 	} | ||||
| 
 | ||||
| 	rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); | ||||
| 	if (!rds_ibdev) | ||||
| 		goto free_attr; | ||||
| 
 | ||||
| 	spin_lock_init(&rds_ibdev->spinlock); | ||||
| 
 | ||||
| 	rds_ibdev->max_wrs = dev_attr->max_qp_wr; | ||||
| 	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); | ||||
| 
 | ||||
| 	rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); | ||||
| 	rds_ibdev->fmr_page_size  = 1 << rds_ibdev->fmr_page_shift; | ||||
| 	rds_ibdev->fmr_page_mask  = ~((u64) rds_ibdev->fmr_page_size - 1); | ||||
| 	rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; | ||||
| 	rds_ibdev->max_fmrs = dev_attr->max_fmr ? | ||||
| 			min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : | ||||
| 			fmr_pool_size; | ||||
| 
 | ||||
| 	rds_ibdev->dev = device; | ||||
| 	rds_ibdev->pd = ib_alloc_pd(device); | ||||
| 	if (IS_ERR(rds_ibdev->pd)) | ||||
| 		goto free_dev; | ||||
| 
 | ||||
| 	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, | ||||
| 				      IB_ACCESS_LOCAL_WRITE); | ||||
| 	if (IS_ERR(rds_ibdev->mr)) | ||||
| 		goto err_pd; | ||||
| 
 | ||||
| 	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); | ||||
| 	if (IS_ERR(rds_ibdev->mr_pool)) { | ||||
| 		rds_ibdev->mr_pool = NULL; | ||||
| 		goto err_mr; | ||||
| 	} | ||||
| 
 | ||||
| 	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); | ||||
| 	INIT_LIST_HEAD(&rds_ibdev->conn_list); | ||||
| 	list_add_tail(&rds_ibdev->list, &rds_ib_devices); | ||||
| 
 | ||||
| 	ib_set_client_data(device, &rds_ib_client, rds_ibdev); | ||||
| 
 | ||||
| 	goto free_attr; | ||||
| 
 | ||||
| err_mr: | ||||
| 	ib_dereg_mr(rds_ibdev->mr); | ||||
| err_pd: | ||||
| 	ib_dealloc_pd(rds_ibdev->pd); | ||||
| free_dev: | ||||
| 	kfree(rds_ibdev); | ||||
| free_attr: | ||||
| 	kfree(dev_attr); | ||||
| } | ||||
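The fmr_page_shift computation above is easy to sanity-check in isolation. Here is a minimal standalone sketch (illustrative userspace C, not part of this patch; the 0x1000 capability value is a made-up example): ffs() returns the 1-based index of the lowest set bit, and the result is clamped to a minimum shift of 9 (512-byte pages).

#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	int page_size_cap = 0x1000;	/* hypothetical dev_attr->page_size_cap */
	int shift = ffs(page_size_cap) - 1;

	if (shift < 9)
		shift = 9;

	uint64_t size = 1ULL << shift;
	uint64_t mask = ~(size - 1);

	/* prints: shift=12 size=4096 mask=0xfffffffffffff000 */
	printf("shift=%d size=%llu mask=0x%llx\n", shift,
	       (unsigned long long)size, (unsigned long long)mask);
	return 0;
}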
| 
 | ||||
| void rds_ib_remove_one(struct ib_device *device) | ||||
| { | ||||
| 	struct rds_ib_device *rds_ibdev; | ||||
| 	struct rds_ib_ipaddr *i_ipaddr, *i_next; | ||||
| 
 | ||||
| 	rds_ibdev = ib_get_client_data(device, &rds_ib_client); | ||||
| 	if (!rds_ibdev) | ||||
| 		return; | ||||
| 
 | ||||
| 	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { | ||||
| 		list_del(&i_ipaddr->list); | ||||
| 		kfree(i_ipaddr); | ||||
| 	} | ||||
| 
 | ||||
| 	rds_ib_remove_conns(rds_ibdev); | ||||
| 
 | ||||
| 	if (rds_ibdev->mr_pool) | ||||
| 		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); | ||||
| 
 | ||||
| 	ib_dereg_mr(rds_ibdev->mr); | ||||
| 
 | ||||
| 	while (ib_dealloc_pd(rds_ibdev->pd)) { | ||||
| 		rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); | ||||
| 		msleep(1); | ||||
| 	} | ||||
| 
 | ||||
| 	list_del(&rds_ibdev->list); | ||||
| 	kfree(rds_ibdev); | ||||
| } | ||||
| 
 | ||||
| struct ib_client rds_ib_client = { | ||||
| 	.name   = "rds_ib", | ||||
| 	.add    = rds_ib_add_one, | ||||
| 	.remove = rds_ib_remove_one | ||||
| }; | ||||
| 
 | ||||
| static int rds_ib_conn_info_visitor(struct rds_connection *conn, | ||||
| 				    void *buffer) | ||||
| { | ||||
| 	struct rds_info_rdma_connection *iinfo = buffer; | ||||
| 	struct rds_ib_connection *ic; | ||||
| 
 | ||||
| 	/* We will only ever look at IB transports */ | ||||
| 	if (conn->c_trans != &rds_ib_transport) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	iinfo->src_addr = conn->c_laddr; | ||||
| 	iinfo->dst_addr = conn->c_faddr; | ||||
| 
 | ||||
| 	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); | ||||
| 	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); | ||||
| 	if (rds_conn_state(conn) == RDS_CONN_UP) { | ||||
| 		struct rds_ib_device *rds_ibdev; | ||||
| 		struct rdma_dev_addr *dev_addr; | ||||
| 
 | ||||
| 		ic = conn->c_transport_data; | ||||
| 		dev_addr = &ic->i_cm_id->route.addr.dev_addr; | ||||
| 
 | ||||
| 		ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); | ||||
| 		ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); | ||||
| 
 | ||||
| 		rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | ||||
| 		iinfo->max_send_wr = ic->i_send_ring.w_nr; | ||||
| 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr; | ||||
| 		iinfo->max_send_sge = rds_ibdev->max_sge; | ||||
| 		rds_ib_get_mr_info(rds_ibdev, iinfo); | ||||
| 	} | ||||
| 	return 1; | ||||
| } | ||||
| 
 | ||||
| static void rds_ib_ic_info(struct socket *sock, unsigned int len, | ||||
| 			   struct rds_info_iterator *iter, | ||||
| 			   struct rds_info_lengths *lens) | ||||
| { | ||||
| 	rds_for_each_conn_info(sock, len, iter, lens, | ||||
| 				rds_ib_conn_info_visitor, | ||||
| 				sizeof(struct rds_info_rdma_connection)); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /*
 | ||||
|  * Early RDS/IB was built to only bind to an address if there is an IPoIB | ||||
|  * device with that address set. | ||||
|  * | ||||
|  * If it were me, I'd advocate for something more flexible.  Sending and | ||||
|  * receiving should be device-agnostic.  Transports would try and maintain | ||||
|  * connections between peers who have messages queued.  Userspace would be | ||||
|  * allowed to influence which paths have priority.  We could call userspace | ||||
|  * asserting this policy "routing". | ||||
|  */ | ||||
| static int rds_ib_laddr_check(__be32 addr) | ||||
| { | ||||
| 	int ret; | ||||
| 	struct rdma_cm_id *cm_id; | ||||
| 	struct sockaddr_in sin; | ||||
| 
 | ||||
| 	/* Create a CMA ID and try to bind it. This catches both
 | ||||
| 	 * IB and iWARP capable NICs. | ||||
| 	 */ | ||||
| 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); | ||||
| 	if (!cm_id) | ||||
| 		return -EADDRNOTAVAIL; | ||||
| 
 | ||||
| 	memset(&sin, 0, sizeof(sin)); | ||||
| 	sin.sin_family = AF_INET; | ||||
| 	sin.sin_addr.s_addr = addr; | ||||
| 
 | ||||
| 	/* rdma_bind_addr will only succeed for IB & iWARP devices */ | ||||
| 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); | ||||
| 	/* due to this, we will claim to support iWARP devices unless we
 | ||||
| 	   check node_type. */ | ||||
| 	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) | ||||
| 		ret = -EADDRNOTAVAIL; | ||||
| 
 | ||||
| 	rdsdebug("addr %pI4 ret %d node type %d\n", | ||||
| 		&addr, ret, | ||||
| 		cm_id->device ? cm_id->device->node_type : -1); | ||||
| 
 | ||||
| 	rdma_destroy_id(cm_id); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| void rds_ib_exit(void) | ||||
| { | ||||
| 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); | ||||
| 	rds_ib_remove_nodev_conns(); | ||||
| 	ib_unregister_client(&rds_ib_client); | ||||
| 	rds_ib_sysctl_exit(); | ||||
| 	rds_ib_recv_exit(); | ||||
| 	rds_trans_unregister(&rds_ib_transport); | ||||
| } | ||||
| 
 | ||||
| struct rds_transport rds_ib_transport = { | ||||
| 	.laddr_check		= rds_ib_laddr_check, | ||||
| 	.xmit_complete		= rds_ib_xmit_complete, | ||||
| 	.xmit			= rds_ib_xmit, | ||||
| 	.xmit_cong_map		= NULL, | ||||
| 	.xmit_rdma		= rds_ib_xmit_rdma, | ||||
| 	.recv			= rds_ib_recv, | ||||
| 	.conn_alloc		= rds_ib_conn_alloc, | ||||
| 	.conn_free		= rds_ib_conn_free, | ||||
| 	.conn_connect		= rds_ib_conn_connect, | ||||
| 	.conn_shutdown		= rds_ib_conn_shutdown, | ||||
| 	.inc_copy_to_user	= rds_ib_inc_copy_to_user, | ||||
| 	.inc_purge		= rds_ib_inc_purge, | ||||
| 	.inc_free		= rds_ib_inc_free, | ||||
| 	.cm_initiate_connect	= rds_ib_cm_initiate_connect, | ||||
| 	.cm_handle_connect	= rds_ib_cm_handle_connect, | ||||
| 	.cm_connect_complete	= rds_ib_cm_connect_complete, | ||||
| 	.stats_info_copy	= rds_ib_stats_info_copy, | ||||
| 	.exit			= rds_ib_exit, | ||||
| 	.get_mr			= rds_ib_get_mr, | ||||
| 	.sync_mr		= rds_ib_sync_mr, | ||||
| 	.free_mr		= rds_ib_free_mr, | ||||
| 	.flush_mrs		= rds_ib_flush_mrs, | ||||
| 	.t_owner		= THIS_MODULE, | ||||
| 	.t_name			= "infiniband", | ||||
| }; | ||||
| 
 | ||||
| int __init rds_ib_init(void) | ||||
| { | ||||
| 	int ret; | ||||
| 
 | ||||
| 	INIT_LIST_HEAD(&rds_ib_devices); | ||||
| 
 | ||||
| 	ret = ib_register_client(&rds_ib_client); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	ret = rds_ib_sysctl_init(); | ||||
| 	if (ret) | ||||
| 		goto out_ibreg; | ||||
| 
 | ||||
| 	ret = rds_ib_recv_init(); | ||||
| 	if (ret) | ||||
| 		goto out_sysctl; | ||||
| 
 | ||||
| 	ret = rds_trans_register(&rds_ib_transport); | ||||
| 	if (ret) | ||||
| 		goto out_recv; | ||||
| 
 | ||||
| 	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); | ||||
| 
 | ||||
| 	goto out; | ||||
| 
 | ||||
| out_recv: | ||||
| 	rds_ib_recv_exit(); | ||||
| out_sysctl: | ||||
| 	rds_ib_sysctl_exit(); | ||||
| out_ibreg: | ||||
| 	ib_unregister_client(&rds_ib_client); | ||||
| out: | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| MODULE_LICENSE("GPL"); | ||||
| 
 | ||||
							
								
								
									
net/rds/ib.h (new file, 367 additions)
							|  | @ -0,0 +1,367 @@ | |||
| #ifndef _RDS_IB_H | ||||
| #define _RDS_IB_H | ||||
| 
 | ||||
| #include <rdma/ib_verbs.h> | ||||
| #include <rdma/rdma_cm.h> | ||||
| #include "rds.h" | ||||
| #include "rdma_transport.h" | ||||
| 
 | ||||
| #define RDS_FMR_SIZE			256 | ||||
| #define RDS_FMR_POOL_SIZE		4096 | ||||
| 
 | ||||
| #define RDS_IB_MAX_SGE			8 | ||||
| #define RDS_IB_RECV_SGE 		2 | ||||
| 
 | ||||
| #define RDS_IB_DEFAULT_RECV_WR		1024 | ||||
| #define RDS_IB_DEFAULT_SEND_WR		256 | ||||
| 
 | ||||
| #define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */ | ||||
| 
 | ||||
| extern struct list_head rds_ib_devices; | ||||
| 
 | ||||
| /*
 | ||||
|  * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to | ||||
|  * try and minimize the amount of memory tied up in both the device and | ||||
|  * socket receive queues. | ||||
|  */ | ||||
| /* page offset of the final full frag that fits in the page */ | ||||
| #define RDS_PAGE_LAST_OFF (((PAGE_SIZE  / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) | ||||
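For a concrete feel of RDS_PAGE_LAST_OFF (RDS_FRAG_SIZE itself is defined in rds.h, not in this patch): assuming a 4096-byte page and, purely for illustration, a 2048-byte fragment, the macro evaluates to (4096 / 2048 - 1) * 2048 = 2048, i.e. the page offset at which the last full fragment begins.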
| struct rds_page_frag { | ||||
| 	struct list_head	f_item; | ||||
| 	struct page		*f_page; | ||||
| 	unsigned long		f_offset; | ||||
| 	dma_addr_t 		f_mapped; | ||||
| }; | ||||
| 
 | ||||
| struct rds_ib_incoming { | ||||
| 	struct list_head	ii_frags; | ||||
| 	struct rds_incoming	ii_inc; | ||||
| }; | ||||
| 
 | ||||
| struct rds_ib_connect_private { | ||||
| 	/* Add new fields at the end, and don't permute existing fields. */ | ||||
| 	__be32			dp_saddr; | ||||
| 	__be32			dp_daddr; | ||||
| 	u8			dp_protocol_major; | ||||
| 	u8			dp_protocol_minor; | ||||
| 	__be16			dp_protocol_minor_mask; /* bitmask */ | ||||
| 	__be32			dp_reserved1; | ||||
| 	__be64			dp_ack_seq; | ||||
| 	__be32			dp_credit;		/* non-zero enables flow ctl */ | ||||
| }; | ||||
| 
 | ||||
| struct rds_ib_send_work { | ||||
| 	struct rds_message	*s_rm; | ||||
| 	struct rds_rdma_op	*s_op; | ||||
| 	struct ib_send_wr	s_wr; | ||||
| 	struct ib_sge		s_sge[RDS_IB_MAX_SGE]; | ||||
| 	unsigned long		s_queued; | ||||
| }; | ||||
| 
 | ||||
| struct rds_ib_recv_work { | ||||
| 	struct rds_ib_incoming 	*r_ibinc; | ||||
| 	struct rds_page_frag	*r_frag; | ||||
| 	struct ib_recv_wr	r_wr; | ||||
| 	struct ib_sge		r_sge[2]; | ||||
| }; | ||||
| 
 | ||||
| struct rds_ib_work_ring { | ||||
| 	u32		w_nr; | ||||
| 	u32		w_alloc_ptr; | ||||
| 	u32		w_alloc_ctr; | ||||
| 	u32		w_free_ptr; | ||||
| 	atomic_t	w_free_ctr; | ||||
| }; | ||||
| 
 | ||||
| struct rds_ib_device; | ||||
| 
 | ||||
| struct rds_ib_connection { | ||||
| 
 | ||||
| 	struct list_head	ib_node; | ||||
| 	struct rds_ib_device	*rds_ibdev; | ||||
| 	struct rds_connection	*conn; | ||||
| 
 | ||||
| 	/* alphabet soup, IBTA style */ | ||||
| 	struct rdma_cm_id	*i_cm_id; | ||||
| 	struct ib_pd		*i_pd; | ||||
| 	struct ib_mr		*i_mr; | ||||
| 	struct ib_cq		*i_send_cq; | ||||
| 	struct ib_cq		*i_recv_cq; | ||||
| 
 | ||||
| 	/* tx */ | ||||
| 	struct rds_ib_work_ring	i_send_ring; | ||||
| 	struct rds_message	*i_rm; | ||||
| 	struct rds_header	*i_send_hdrs; | ||||
| 	u64			i_send_hdrs_dma; | ||||
| 	struct rds_ib_send_work *i_sends; | ||||
| 
 | ||||
| 	/* rx */ | ||||
| 	struct mutex		i_recv_mutex; | ||||
| 	struct rds_ib_work_ring	i_recv_ring; | ||||
| 	struct rds_ib_incoming	*i_ibinc; | ||||
| 	u32			i_recv_data_rem; | ||||
| 	struct rds_header	*i_recv_hdrs; | ||||
| 	u64			i_recv_hdrs_dma; | ||||
| 	struct rds_ib_recv_work *i_recvs; | ||||
| 	struct rds_page_frag	i_frag; | ||||
| 	u64			i_ack_recv;	/* last ACK received */ | ||||
| 
 | ||||
| 	/* sending acks */ | ||||
| 	unsigned long		i_ack_flags; | ||||
| 	u64			i_ack_next;	/* next ACK to send */ | ||||
| 	struct rds_header	*i_ack; | ||||
| 	struct ib_send_wr	i_ack_wr; | ||||
| 	struct ib_sge		i_ack_sge; | ||||
| 	u64			i_ack_dma; | ||||
| 	unsigned long		i_ack_queued; | ||||
| 
 | ||||
| 	/* Flow control related information
 | ||||
| 	 * | ||||
| 	 * Our algorithm uses a pair of variables that we need to access | ||||
| 	 * atomically - one for the send credits, and one for the posted | ||||
| 	 * recv credits we need to transfer to the remote. | ||||
| 	 * Rather than protect them using a slow spinlock, we put both into | ||||
| 	 * a single atomic_t and update it using cmpxchg | ||||
| 	 */ | ||||
| 	atomic_t		i_credits; | ||||
| 
 | ||||
| 	/* Protocol version specific information */ | ||||
| 	unsigned int		i_flowctl:1;	/* enable/disable flow ctl */ | ||||
| 
 | ||||
| 	/* Batched completions */ | ||||
| 	unsigned int		i_unsignaled_wrs; | ||||
| 	long			i_unsignaled_bytes; | ||||
| }; | ||||
| 
 | ||||
| /* This assumes that atomic_t is at least 32 bits */ | ||||
| #define IB_GET_SEND_CREDITS(v)	((v) & 0xffff) | ||||
| #define IB_GET_POST_CREDITS(v)	((v) >> 16) | ||||
| #define IB_SET_SEND_CREDITS(v)	((v) & 0xffff) | ||||
| #define IB_SET_POST_CREDITS(v)	((v) << 16) | ||||
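As a minimal sketch of the packing these macros encode (userspace C11 atomics standing in for the kernel's atomic_t; no RDS code involved): send credits occupy the low 16 bits and posted-receive credits the high 16, so both counters can be read or adjusted with a single atomic operation.

#include <stdatomic.h>
#include <stdio.h>

#define IB_GET_SEND_CREDITS(v)	((v) & 0xffff)
#define IB_GET_POST_CREDITS(v)	((v) >> 16)
#define IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
#define IB_SET_POST_CREDITS(v)	((v) << 16)

int main(void)
{
	atomic_uint credits = 0;

	/* grant 64 send credits, then record 16 newly posted recv buffers */
	atomic_fetch_add(&credits, IB_SET_SEND_CREDITS(64));
	atomic_fetch_add(&credits, IB_SET_POST_CREDITS(16));

	unsigned int v = atomic_load(&credits);

	/* prints: send=64 post=16 */
	printf("send=%u post=%u\n",
	       IB_GET_SEND_CREDITS(v), IB_GET_POST_CREDITS(v));
	return 0;
}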
| 
 | ||||
| struct rds_ib_ipaddr { | ||||
| 	struct list_head	list; | ||||
| 	__be32			ipaddr; | ||||
| }; | ||||
| 
 | ||||
| struct rds_ib_device { | ||||
| 	struct list_head	list; | ||||
| 	struct list_head	ipaddr_list; | ||||
| 	struct list_head	conn_list; | ||||
| 	struct ib_device	*dev; | ||||
| 	struct ib_pd		*pd; | ||||
| 	struct ib_mr		*mr; | ||||
| 	struct rds_ib_mr_pool	*mr_pool; | ||||
| 	int			fmr_page_shift; | ||||
| 	int			fmr_page_size; | ||||
| 	u64			fmr_page_mask; | ||||
| 	unsigned int		fmr_max_remaps; | ||||
| 	unsigned int		max_fmrs; | ||||
| 	int			max_sge; | ||||
| 	unsigned int		max_wrs; | ||||
| 	spinlock_t		spinlock;	/* protect the above */ | ||||
| }; | ||||
| 
 | ||||
| /* bits for i_ack_flags */ | ||||
| #define IB_ACK_IN_FLIGHT	0 | ||||
| #define IB_ACK_REQUESTED	1 | ||||
| 
 | ||||
| /* Magic WR_ID for ACKs */ | ||||
| #define RDS_IB_ACK_WR_ID	(~(u64) 0) | ||||
| 
 | ||||
| struct rds_ib_statistics { | ||||
| 	uint64_t	s_ib_connect_raced; | ||||
| 	uint64_t	s_ib_listen_closed_stale; | ||||
| 	uint64_t	s_ib_tx_cq_call; | ||||
| 	uint64_t	s_ib_tx_cq_event; | ||||
| 	uint64_t	s_ib_tx_ring_full; | ||||
| 	uint64_t	s_ib_tx_throttle; | ||||
| 	uint64_t	s_ib_tx_sg_mapping_failure; | ||||
| 	uint64_t	s_ib_tx_stalled; | ||||
| 	uint64_t	s_ib_tx_credit_updates; | ||||
| 	uint64_t	s_ib_rx_cq_call; | ||||
| 	uint64_t	s_ib_rx_cq_event; | ||||
| 	uint64_t	s_ib_rx_ring_empty; | ||||
| 	uint64_t	s_ib_rx_refill_from_cq; | ||||
| 	uint64_t	s_ib_rx_refill_from_thread; | ||||
| 	uint64_t	s_ib_rx_alloc_limit; | ||||
| 	uint64_t	s_ib_rx_credit_updates; | ||||
| 	uint64_t	s_ib_ack_sent; | ||||
| 	uint64_t	s_ib_ack_send_failure; | ||||
| 	uint64_t	s_ib_ack_send_delayed; | ||||
| 	uint64_t	s_ib_ack_send_piggybacked; | ||||
| 	uint64_t	s_ib_ack_received; | ||||
| 	uint64_t	s_ib_rdma_mr_alloc; | ||||
| 	uint64_t	s_ib_rdma_mr_free; | ||||
| 	uint64_t	s_ib_rdma_mr_used; | ||||
| 	uint64_t	s_ib_rdma_mr_pool_flush; | ||||
| 	uint64_t	s_ib_rdma_mr_pool_wait; | ||||
| 	uint64_t	s_ib_rdma_mr_pool_depleted; | ||||
| }; | ||||
| 
 | ||||
| extern struct workqueue_struct *rds_ib_wq; | ||||
| 
 | ||||
| /*
 | ||||
|  * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h | ||||
|  * doesn't define it. | ||||
|  */ | ||||
| static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, | ||||
| 		struct scatterlist *sg, unsigned int sg_dma_len, int direction) | ||||
| { | ||||
| 	unsigned int i; | ||||
| 
 | ||||
| 	for (i = 0; i < sg_dma_len; ++i) { | ||||
| 		ib_dma_sync_single_for_cpu(dev, | ||||
| 				ib_sg_dma_address(dev, &sg[i]), | ||||
| 				ib_sg_dma_len(dev, &sg[i]), | ||||
| 				direction); | ||||
| 	} | ||||
| } | ||||
| #define ib_dma_sync_sg_for_cpu	rds_ib_dma_sync_sg_for_cpu | ||||
| 
 | ||||
| static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, | ||||
| 		struct scatterlist *sg, unsigned int sg_dma_len, int direction) | ||||
| { | ||||
| 	unsigned int i; | ||||
| 
 | ||||
| 	for (i = 0; i < sg_dma_len; ++i) { | ||||
| 		ib_dma_sync_single_for_device(dev, | ||||
| 				ib_sg_dma_address(dev, &sg[i]), | ||||
| 				ib_sg_dma_len(dev, &sg[i]), | ||||
| 				direction); | ||||
| 	} | ||||
| } | ||||
| #define ib_dma_sync_sg_for_device	rds_ib_dma_sync_sg_for_device | ||||
| 
 | ||||
| 
 | ||||
| /* ib.c */ | ||||
| extern struct rds_transport rds_ib_transport; | ||||
| extern void rds_ib_add_one(struct ib_device *device); | ||||
| extern void rds_ib_remove_one(struct ib_device *device); | ||||
| extern struct ib_client rds_ib_client; | ||||
| 
 | ||||
| extern unsigned int fmr_pool_size; | ||||
| extern unsigned int fmr_message_size; | ||||
| 
 | ||||
| extern spinlock_t ib_nodev_conns_lock; | ||||
| extern struct list_head ib_nodev_conns; | ||||
| 
 | ||||
| /* ib_cm.c */ | ||||
| int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); | ||||
| void rds_ib_conn_free(void *arg); | ||||
| int rds_ib_conn_connect(struct rds_connection *conn); | ||||
| void rds_ib_conn_shutdown(struct rds_connection *conn); | ||||
| void rds_ib_state_change(struct sock *sk); | ||||
| int __init rds_ib_listen_init(void); | ||||
| void rds_ib_listen_stop(void); | ||||
| void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); | ||||
| int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | ||||
| 			     struct rdma_cm_event *event); | ||||
| int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); | ||||
| void rds_ib_cm_connect_complete(struct rds_connection *conn, | ||||
| 				struct rdma_cm_event *event); | ||||
| 
 | ||||
| 
 | ||||
| #define rds_ib_conn_error(conn, fmt...) \ | ||||
| 	__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) | ||||
| 
 | ||||
| /* ib_rdma.c */ | ||||
| int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); | ||||
| int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); | ||||
| void rds_ib_remove_nodev_conns(void); | ||||
| void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev); | ||||
| struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); | ||||
| void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); | ||||
| void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); | ||||
| void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | ||||
| 		    struct rds_sock *rs, u32 *key_ret); | ||||
| void rds_ib_sync_mr(void *trans_private, int dir); | ||||
| void rds_ib_free_mr(void *trans_private, int invalidate); | ||||
| void rds_ib_flush_mrs(void); | ||||
| 
 | ||||
| /* ib_recv.c */ | ||||
| int __init rds_ib_recv_init(void); | ||||
| void rds_ib_recv_exit(void); | ||||
| int rds_ib_recv(struct rds_connection *conn); | ||||
| int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | ||||
| 		       gfp_t page_gfp, int prefill); | ||||
| void rds_ib_inc_purge(struct rds_incoming *inc); | ||||
| void rds_ib_inc_free(struct rds_incoming *inc); | ||||
| int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | ||||
| 			     size_t size); | ||||
| void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); | ||||
| void rds_ib_recv_init_ring(struct rds_ib_connection *ic); | ||||
| void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); | ||||
| void rds_ib_recv_init_ack(struct rds_ib_connection *ic); | ||||
| void rds_ib_attempt_ack(struct rds_ib_connection *ic); | ||||
| void rds_ib_ack_send_complete(struct rds_ib_connection *ic); | ||||
| u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); | ||||
| 
 | ||||
| /* ib_ring.c */ | ||||
| void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); | ||||
| void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr); | ||||
| u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos); | ||||
| void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val); | ||||
| void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val); | ||||
| int rds_ib_ring_empty(struct rds_ib_work_ring *ring); | ||||
| int rds_ib_ring_low(struct rds_ib_work_ring *ring); | ||||
| u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring); | ||||
| u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); | ||||
| extern wait_queue_head_t rds_ib_ring_empty_wait; | ||||
| 
 | ||||
| /* ib_send.c */ | ||||
| void rds_ib_xmit_complete(struct rds_connection *conn); | ||||
| int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | ||||
| 		unsigned int hdr_off, unsigned int sg, unsigned int off); | ||||
| void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); | ||||
| void rds_ib_send_init_ring(struct rds_ib_connection *ic); | ||||
| void rds_ib_send_clear_ring(struct rds_ib_connection *ic); | ||||
| int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); | ||||
| void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); | ||||
| void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); | ||||
| int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, | ||||
| 			     u32 *adv_credits, int need_posted); | ||||
| 
 | ||||
| /* ib_stats.c */ | ||||
| DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); | ||||
| #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) | ||||
| unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, | ||||
| 				    unsigned int avail); | ||||
| 
 | ||||
| /* ib_sysctl.c */ | ||||
| int __init rds_ib_sysctl_init(void); | ||||
| void rds_ib_sysctl_exit(void); | ||||
| extern unsigned long rds_ib_sysctl_max_send_wr; | ||||
| extern unsigned long rds_ib_sysctl_max_recv_wr; | ||||
| extern unsigned long rds_ib_sysctl_max_unsig_wrs; | ||||
| extern unsigned long rds_ib_sysctl_max_unsig_bytes; | ||||
| extern unsigned long rds_ib_sysctl_max_recv_allocation; | ||||
| extern unsigned int rds_ib_sysctl_flow_control; | ||||
| extern ctl_table rds_ib_sysctl_table[]; | ||||
| 
 | ||||
| /*
 | ||||
|  * Helper functions for getting/setting the header and data SGEs in | ||||
|  * RDS packets (not RDMA) | ||||
|  */ | ||||
| static inline struct ib_sge * | ||||
| rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||||
| { | ||||
| 	return &sge[0]; | ||||
| } | ||||
| 
 | ||||
| static inline struct ib_sge * | ||||
| rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||||
| { | ||||
| 	return &sge[1]; | ||||
| } | ||||
| 
 | ||||
| static inline void rds_ib_set_64bit(u64 *ptr, u64 val) | ||||
| { | ||||
| #if BITS_PER_LONG == 64 | ||||
| 	*ptr = val; | ||||
| #else | ||||
| 	set_64bit(ptr, val); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
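A userspace analogue of rds_ib_set_64bit above, offered as a hedged sketch rather than the kernel mechanism: on 32-bit targets a plain 64-bit store may be issued as two 32-bit stores, so a concurrent reader can observe a torn value; C11 _Atomic supplies the indivisible store that the kernel gets from set_64bit().

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t ack_next;	/* stands in for ic->i_ack_next */

static void set_ack_next(uint64_t val)
{
	/* one indivisible store, even where BITS_PER_LONG == 32 */
	atomic_store_explicit(&ack_next, val, memory_order_relaxed);
}

int main(void)
{
	set_ack_next(42);
	return (int)atomic_load(&ack_next);	/* 42 */
}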
							
								
								
									
net/rds/ib_cm.c (new file, 726 additions)
							|  | @ -0,0 +1,726 @@ | |||
| /*
 | ||||
|  * Copyright (c) 2006 Oracle.  All rights reserved. | ||||
|  * | ||||
|  * This software is available to you under a choice of one of two | ||||
|  * licenses.  You may choose to be licensed under the terms of the GNU | ||||
|  * General Public License (GPL) Version 2, available from the file | ||||
|  * COPYING in the main directory of this source tree, or the | ||||
|  * OpenIB.org BSD license below: | ||||
|  * | ||||
|  *     Redistribution and use in source and binary forms, with or | ||||
|  *     without modification, are permitted provided that the following | ||||
|  *     conditions are met: | ||||
|  * | ||||
|  *      - Redistributions of source code must retain the above | ||||
|  *        copyright notice, this list of conditions and the following | ||||
|  *        disclaimer. | ||||
|  * | ||||
|  *      - Redistributions in binary form must reproduce the above | ||||
|  *        copyright notice, this list of conditions and the following | ||||
|  *        disclaimer in the documentation and/or other materials | ||||
|  *        provided with the distribution. | ||||
|  * | ||||
|  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||||
|  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||||
|  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||||
|  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||||
|  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||||
|  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||||
|  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
|  * SOFTWARE. | ||||
|  * | ||||
|  */ | ||||
| #include <linux/kernel.h> | ||||
| #include <linux/in.h> | ||||
| #include <linux/vmalloc.h> | ||||
| 
 | ||||
| #include "rds.h" | ||||
| #include "ib.h" | ||||
| 
 | ||||
| /*
 | ||||
|  * Set the selected protocol version | ||||
|  */ | ||||
| static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version) | ||||
| { | ||||
| 	conn->c_version = version; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Set up flow control | ||||
|  */ | ||||
| static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits) | ||||
| { | ||||
| 	struct rds_ib_connection *ic = conn->c_transport_data; | ||||
| 
 | ||||
| 	if (rds_ib_sysctl_flow_control && credits != 0) { | ||||
| 		/* We're doing flow control */ | ||||
| 		ic->i_flowctl = 1; | ||||
| 		rds_ib_send_add_credits(conn, credits); | ||||
| 	} else { | ||||
| 		ic->i_flowctl = 0; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Tune RNR behavior. Without flow control, we use a rather | ||||
|  * low timeout, but not the absolute minimum - this should | ||||
|  * be tunable. | ||||
|  * | ||||
|  * We already set the RNR retry count to 7 (which is the | ||||
|  * smallest infinite number :-) above. | ||||
|  * If flow control is off, we want to change this back to 0 | ||||
|  * so that we learn quickly when our credit accounting is | ||||
|  * buggy. | ||||
|  * | ||||
|  * Caller passes in a qp_attr pointer - don't waste stack space | ||||
|  * by allocating this twice. | ||||
|  */ | ||||
| static void | ||||
| rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) | ||||
| { | ||||
| 	int ret; | ||||
| 
 | ||||
| 	attr->min_rnr_timer = IB_RNR_TIMER_000_32; | ||||
| 	ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); | ||||
| 	if (ret) | ||||
| 		printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Connection established. | ||||
|  * We get here for both outgoing and incoming connections. | ||||
|  */ | ||||
| void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) | ||||
| { | ||||
| 	const struct rds_ib_connect_private *dp = NULL; | ||||
| 	struct rds_ib_connection *ic = conn->c_transport_data; | ||||
| 	struct rds_ib_device *rds_ibdev; | ||||
| 	struct ib_qp_attr qp_attr; | ||||
| 	int err; | ||||
| 
 | ||||
| 	if (event->param.conn.private_data_len) { | ||||
| 		dp = event->param.conn.private_data; | ||||
| 
 | ||||
| 		rds_ib_set_protocol(conn, | ||||
| 				RDS_PROTOCOL(dp->dp_protocol_major, | ||||
| 					dp->dp_protocol_minor)); | ||||
| 		rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); | ||||
| 	} | ||||
| 
 | ||||
| 	printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", | ||||
| 			&conn->c_laddr, | ||||
| 			RDS_PROTOCOL_MAJOR(conn->c_version), | ||||
| 			RDS_PROTOCOL_MINOR(conn->c_version), | ||||
| 			ic->i_flowctl ? ", flow control" : ""); | ||||
| 
 | ||||
| 	/* Tune RNR behavior */ | ||||
| 	rds_ib_tune_rnr(ic, &qp_attr); | ||||
| 
 | ||||
| 	qp_attr.qp_state = IB_QPS_RTS; | ||||
| 	err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); | ||||
| 	if (err) | ||||
| 		printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); | ||||
| 
 | ||||
| 	/* update ib_device with this local ipaddr & conn */ | ||||
| 	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | ||||
| 	err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); | ||||
| 	if (err) | ||||
| 		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); | ||||
| 	err = rds_ib_add_conn(rds_ibdev, conn); | ||||
| 	if (err) | ||||
| 		printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err); | ||||
| 
 | ||||
| 	/* If the peer gave us the last packet it saw, process this as if
 | ||||
| 	 * we had received a regular ACK. */ | ||||
| 	if (dp && dp->dp_ack_seq) | ||||
| 		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); | ||||
| 
 | ||||
| 	rds_connect_complete(conn); | ||||
| } | ||||
| 
 | ||||
| static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, | ||||
| 			struct rdma_conn_param *conn_param, | ||||
| 			struct rds_ib_connect_private *dp, | ||||
| 			u32 protocol_version) | ||||
| { | ||||
| 	memset(conn_param, 0, sizeof(struct rdma_conn_param)); | ||||
| 	/* XXX tune these? */ | ||||
| 	conn_param->responder_resources = 1; | ||||
| 	conn_param->initiator_depth = 1; | ||||
| 	conn_param->retry_count = 7; | ||||
| 	conn_param->rnr_retry_count = 7; | ||||
| 
 | ||||
| 	if (dp) { | ||||
| 		struct rds_ib_connection *ic = conn->c_transport_data; | ||||
| 
 | ||||
| 		memset(dp, 0, sizeof(*dp)); | ||||
| 		dp->dp_saddr = conn->c_laddr; | ||||
| 		dp->dp_daddr = conn->c_faddr; | ||||
| 		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); | ||||
| 		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); | ||||
| 		dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); | ||||
| 		dp->dp_ack_seq = rds_ib_piggyb_ack(ic); | ||||
| 
 | ||||
| 		/* Advertise flow control */ | ||||
| 		if (ic->i_flowctl) { | ||||
| 			unsigned int credits; | ||||
| 
 | ||||
| 			credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); | ||||
| 			dp->dp_credit = cpu_to_be32(credits); | ||||
| 			atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); | ||||
| 		} | ||||
| 
 | ||||
| 		conn_param->private_data = dp; | ||||
| 		conn_param->private_data_len = sizeof(*dp); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void rds_ib_cq_event_handler(struct ib_event *event, void *data) | ||||
| { | ||||
| 	rdsdebug("event %u data %p\n", event->event, data); | ||||
| } | ||||
| 
 | ||||
| static void rds_ib_qp_event_handler(struct ib_event *event, void *data) | ||||
| { | ||||
| 	struct rds_connection *conn = data; | ||||
| 	struct rds_ib_connection *ic = conn->c_transport_data; | ||||
| 
 | ||||
| 	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); | ||||
| 
 | ||||
| 	switch (event->event) { | ||||
| 	case IB_EVENT_COMM_EST: | ||||
| 		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); | ||||
| 		break; | ||||
| 	default: | ||||
| 		printk(KERN_WARNING "RDS/ib: unhandled QP event %u " | ||||
| 		       "on connection to %pI4\n", event->event, | ||||
| 		       &conn->c_faddr); | ||||
| 		break; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * This needs to be very careful to not leave IS_ERR pointers around for | ||||
|  * cleanup to trip over. | ||||
|  */ | ||||
| static int rds_ib_setup_qp(struct rds_connection *conn) | ||||
| { | ||||
| 	struct rds_ib_connection *ic = conn->c_transport_data; | ||||
| 	struct ib_device *dev = ic->i_cm_id->device; | ||||
| 	struct ib_qp_init_attr attr; | ||||
| 	struct rds_ib_device *rds_ibdev; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/* rds_ib_add_one creates a rds_ib_device object per IB device,
 | ||||
| 	 * and allocates a protection domain, memory region and FMR pool | ||||
| 	 * for each.  If that fails for any reason, it will not register | ||||
| 	 * the rds_ibdev at all. | ||||
| 	 */ | ||||
| 	rds_ibdev = ib_get_client_data(dev, &rds_ib_client); | ||||
| 	if (rds_ibdev == NULL) { | ||||
| 		if (printk_ratelimit()) | ||||
| 			printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", | ||||
| 					dev->name); | ||||
| 		return -EOPNOTSUPP; | ||||
| 	} | ||||
| 
 | ||||
| 	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) | ||||
| 		rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); | ||||
| 	if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) | ||||
| 		rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); | ||||
| 
 | ||||
| 	/* Protection domain and memory region */ | ||||
| 	ic->i_pd = rds_ibdev->pd; | ||||
| 	ic->i_mr = rds_ibdev->mr; | ||||
| 
 | ||||
| 	ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, | ||||
| 				     rds_ib_cq_event_handler, conn, | ||||
| 				     ic->i_send_ring.w_nr + 1, 0); | ||||
| 	if (IS_ERR(ic->i_send_cq)) { | ||||
| 		ret = PTR_ERR(ic->i_send_cq); | ||||
| 		ic->i_send_cq = NULL; | ||||
| 		rdsdebug("ib_create_cq send failed: %d\n", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, | ||||
| 				     rds_ib_cq_event_handler, conn, | ||||
| 				     ic->i_recv_ring.w_nr, 0); | ||||
| 	if (IS_ERR(ic->i_recv_cq)) { | ||||
| 		ret = PTR_ERR(ic->i_recv_cq); | ||||
| 		ic->i_recv_cq = NULL; | ||||
| 		rdsdebug("ib_create_cq recv failed: %d\n", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); | ||||
| 	if (ret) { | ||||
| 		rdsdebug("ib_req_notify_cq send failed: %d\n", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); | ||||
| 	if (ret) { | ||||
| 		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	/* XXX negotiate max send/recv with remote? */ | ||||
| 	memset(&attr, 0, sizeof(attr)); | ||||
| 	attr.event_handler = rds_ib_qp_event_handler; | ||||
| 	attr.qp_context = conn; | ||||
| 	/* + 1 to allow for the single ack message */ | ||||
| 	attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; | ||||
| 	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; | ||||
| 	attr.cap.max_send_sge = rds_ibdev->max_sge; | ||||
| 	attr.cap.max_recv_sge = RDS_IB_RECV_SGE; | ||||
| 	attr.sq_sig_type = IB_SIGNAL_REQ_WR; | ||||
| 	attr.qp_type = IB_QPT_RC; | ||||
| 	attr.send_cq = ic->i_send_cq; | ||||
| 	attr.recv_cq = ic->i_recv_cq; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * XXX this can fail if max_*_wr is too large?  Are we supposed | ||||
| 	 * to back off until we get a value that the hardware can support? | ||||
| 	 */ | ||||
| 	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); | ||||
| 	if (ret) { | ||||
| 		rdsdebug("rdma_create_qp failed: %d\n", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ic->i_send_hdrs = ib_dma_alloc_coherent(dev, | ||||
| 					   ic->i_send_ring.w_nr * | ||||
| 						sizeof(struct rds_header), | ||||
| 					   &ic->i_send_hdrs_dma, GFP_KERNEL); | ||||
| 	if (ic->i_send_hdrs == NULL) { | ||||
| 		ret = -ENOMEM; | ||||
| 		rdsdebug("ib_dma_alloc_coherent send failed\n"); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, | ||||
| 					   ic->i_recv_ring.w_nr * | ||||
| 						sizeof(struct rds_header), | ||||
| 					   &ic->i_recv_hdrs_dma, GFP_KERNEL); | ||||
| 	if (ic->i_recv_hdrs == NULL) { | ||||
| 		ret = -ENOMEM; | ||||
| 		rdsdebug("ib_dma_alloc_coherent recv failed\n"); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), | ||||
| 				       &ic->i_ack_dma, GFP_KERNEL); | ||||
| 	if (ic->i_ack == NULL) { | ||||
| 		ret = -ENOMEM; | ||||
| 		rdsdebug("ib_dma_alloc_coherent ack failed\n"); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); | ||||
| 	if (ic->i_sends == NULL) { | ||||
| 		ret = -ENOMEM; | ||||
| 		rdsdebug("send allocation failed\n"); | ||||
| 		goto out; | ||||
| 	} | ||||
| 	rds_ib_send_init_ring(ic); | ||||
| 
 | ||||
| 	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); | ||||
| 	if (ic->i_recvs == NULL) { | ||||
| 		ret = -ENOMEM; | ||||
| 		rdsdebug("recv allocation failed\n"); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	rds_ib_recv_init_ring(ic); | ||||
| 	rds_ib_recv_init_ack(ic); | ||||
| 
 | ||||
| 	/* Post receive buffers - as a side effect, this will update
 | ||||
| 	 * the posted credit count. */ | ||||
| 	rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); | ||||
| 
 | ||||
| 	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, | ||||
| 		 ic->i_send_cq, ic->i_recv_cq); | ||||
| 
 | ||||
| out: | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp) | ||||
| { | ||||
| 	u16 common; | ||||
| 	u32 version = 0; | ||||
| 
 | ||||
| 	/* rdma_cm private data is odd - when there is any private data in the
 | ||||
|  * request, we will be given a pretty large buffer without being told the | ||||
| 	 * original size. The only way to tell the difference is by looking at | ||||
| 	 * the contents, which are initialized to zero. | ||||
| 	 * If the protocol version fields aren't set, this is a connection attempt | ||||
|  * from an older version. This could be 3.0 or 2.0 - we can't tell. | ||||
| 	 * We really should have changed this for OFED 1.3 :-( */ | ||||
| 	if (dp->dp_protocol_major == 0) | ||||
| 		return RDS_PROTOCOL_3_0; | ||||
| 
 | ||||
| 	common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; | ||||
| 	if (dp->dp_protocol_major == 3 && common) { | ||||
| 		version = RDS_PROTOCOL_3_0; | ||||
| 		while ((common >>= 1) != 0) | ||||
| 			version++; | ||||
| 	} else if (printk_ratelimit()) { | ||||
| 		printk(KERN_NOTICE "RDS: Connection from %pI4 using " | ||||
| 			"incompatible protocol version %u.%u\n", | ||||
| 			&dp->dp_saddr, | ||||
| 			dp->dp_protocol_major, | ||||
| 			dp->dp_protocol_minor); | ||||
| 	} | ||||
| 	return version; | ||||
| } | ||||
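To make the negotiation loop concrete, a toy standalone rendering (hypothetical values; RDS_IB_SUPPORTED_PROTOCOLS is 0x0003 in ib.h above): each bit of dp_protocol_minor_mask advertises one supported minor version, and the loop finds the highest bit both peers share, so two peers each advertising minors 0 and 1 settle on protocol 3.1.

#include <stdio.h>

int main(void)
{
	unsigned short ours   = 0x0003;	/* RDS_IB_SUPPORTED_PROTOCOLS */
	unsigned short theirs = 0x0003;	/* peer's dp_protocol_minor_mask */
	unsigned short common = ours & theirs;
	unsigned int minor = 0;

	while ((common >>= 1) != 0)
		minor++;

	printf("negotiated RDS protocol 3.%u\n", minor);	/* 3.1 */
	return 0;
}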
| 
 | ||||
| int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | ||||
| 				    struct rdma_cm_event *event) | ||||
| { | ||||
| 	__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; | ||||
| 	__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; | ||||
| 	const struct rds_ib_connect_private *dp = event->param.conn.private_data; | ||||
| 	struct rds_ib_connect_private dp_rep; | ||||
| 	struct rds_connection *conn = NULL; | ||||
| 	struct rds_ib_connection *ic = NULL; | ||||
| 	struct rdma_conn_param conn_param; | ||||
| 	u32 version; | ||||
| 	int err, destroy = 1; | ||||
| 
 | ||||
| 	/* Check whether the remote protocol version matches ours. */ | ||||
| 	version = rds_ib_protocol_compatible(dp); | ||||
| 	if (!version) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " | ||||
| 		 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, | ||||
| 		 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), | ||||
| 		 (unsigned long long)be64_to_cpu(lguid), | ||||
| 		 (unsigned long long)be64_to_cpu(fguid)); | ||||
| 
 | ||||
| 	conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, | ||||
| 			       GFP_KERNEL); | ||||
| 	if (IS_ERR(conn)) { | ||||
| 		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); | ||||
| 		conn = NULL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The connection request may occur while the | ||||
| 	 * previous connection exists, e.g. in case of failover. | ||||
| 	 * But as connections may be initiated simultaneously | ||||
| 	 * by both hosts, we have a random backoff mechanism - | ||||
| 	 * see the comment above rds_queue_reconnect() | ||||
| 	 */ | ||||
| 	mutex_lock(&conn->c_cm_lock); | ||||
| 	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { | ||||
| 		if (rds_conn_state(conn) == RDS_CONN_UP) { | ||||
| 			rdsdebug("incoming connect while connecting\n"); | ||||
| 			rds_conn_drop(conn); | ||||
| 			rds_ib_stats_inc(s_ib_listen_closed_stale); | ||||
| 		} else | ||||
| 		if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { | ||||
| 			/* Wait and see - our connect may still be succeeding */ | ||||
| 			rds_ib_stats_inc(s_ib_connect_raced); | ||||
| 		} | ||||
| 		mutex_unlock(&conn->c_cm_lock); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ic = conn->c_transport_data; | ||||
| 
 | ||||
| 	rds_ib_set_protocol(conn, version); | ||||
| 	rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); | ||||
| 
 | ||||
| 	/* If the peer gave us the last packet it saw, process this as if
 | ||||
| 	 * we had received a regular ACK. */ | ||||
| 	if (dp->dp_ack_seq) | ||||
| 		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); | ||||
| 
 | ||||
| 	BUG_ON(cm_id->context); | ||||
| 	BUG_ON(ic->i_cm_id); | ||||
| 
 | ||||
| 	ic->i_cm_id = cm_id; | ||||
| 	cm_id->context = conn; | ||||
| 
 | ||||
| 	/* We got halfway through setting up the ib_connection, if we
 | ||||
| 	 * fail now, we have to take the long route out of this mess. */ | ||||
| 	destroy = 0; | ||||
| 
 | ||||
| 	err = rds_ib_setup_qp(conn); | ||||
| 	if (err) { | ||||
| 		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); | ||||
| 
 | ||||
| 	/* rdma_accept() calls rdma_reject() internally if it fails */ | ||||
| 	err = rdma_accept(cm_id, &conn_param); | ||||
| 	mutex_unlock(&conn->c_cm_lock); | ||||
| 	if (err) { | ||||
| 		rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| 
 | ||||
| out: | ||||
| 	rdma_reject(cm_id, NULL, 0); | ||||
| 	return destroy; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) | ||||
| { | ||||
| 	struct rds_connection *conn = cm_id->context; | ||||
| 	struct rds_ib_connection *ic = conn->c_transport_data; | ||||
| 	struct rdma_conn_param conn_param; | ||||
| 	struct rds_ib_connect_private dp; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/* If the peer doesn't do protocol negotiation, we must
 | ||||
| 	 * default to RDSv3.0 */ | ||||
| 	rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); | ||||
| 	ic->i_flowctl = rds_ib_sysctl_flow_control;	/* advertise flow control */ | ||||
| 
 | ||||
| 	ret = rds_ib_setup_qp(conn); | ||||
| 	if (ret) { | ||||
| 		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); | ||||
| 
 | ||||
| 	ret = rdma_connect(cm_id, &conn_param); | ||||
| 	if (ret) | ||||
| 		rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); | ||||
| 
 | ||||
| out: | ||||
| 	/* Beware - returning non-zero tells the rdma_cm to destroy
 | ||||
| 	 * the cm_id. We should certainly not do it as long as we still | ||||
| 	 * "own" the cm_id. */ | ||||
| 	if (ret) { | ||||
| 		if (ic->i_cm_id == cm_id) | ||||
| 			ret = 0; | ||||
| 	} | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| int rds_ib_conn_connect(struct rds_connection *conn) | ||||
| { | ||||
| 	struct rds_ib_connection *ic = conn->c_transport_data; | ||||
| 	struct sockaddr_in src, dest; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/* XXX I wonder what effect the port space has */ | ||||
| 	/* delegate cm event handler to rdma_transport */ | ||||
| 	ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, | ||||
| 				     RDMA_PS_TCP); | ||||
| 	if (IS_ERR(ic->i_cm_id)) { | ||||
| 		ret = PTR_ERR(ic->i_cm_id); | ||||
| 		ic->i_cm_id = NULL; | ||||
| 		rdsdebug("rdma_create_id() failed: %d\n", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); | ||||
| 
 | ||||
| 	src.sin_family = AF_INET; | ||||
| 	src.sin_addr.s_addr = (__force u32)conn->c_laddr; | ||||
| 	src.sin_port = (__force u16)htons(0); | ||||
| 
 | ||||
| 	dest.sin_family = AF_INET; | ||||
| 	dest.sin_addr.s_addr = (__force u32)conn->c_faddr; | ||||
| 	dest.sin_port = (__force u16)htons(RDS_PORT); | ||||
| 
 | ||||
| 	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, | ||||
| 				(struct sockaddr *)&dest, | ||||
| 				RDS_RDMA_RESOLVE_TIMEOUT_MS); | ||||
| 	if (ret) { | ||||
| 		rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, | ||||
| 			 ret); | ||||
| 		rdma_destroy_id(ic->i_cm_id); | ||||
| 		ic->i_cm_id = NULL; | ||||
| 	} | ||||
| 
 | ||||
| out: | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * This is so careful about only cleaning up resources that were built up | ||||
|  * so that it can be called at any point during startup.  In fact it | ||||
|  * can be called multiple times for a given connection. | ||||
|  */ | ||||
| void rds_ib_conn_shutdown(struct rds_connection *conn) | ||||
| { | ||||
| 	struct rds_ib_connection *ic = conn->c_transport_data; | ||||
| 	int err = 0; | ||||
| 
 | ||||
| 	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, | ||||
| 		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq, | ||||
| 		 ic->i_cm_id ? ic->i_cm_id->qp : NULL); | ||||
| 
 | ||||
| 	if (ic->i_cm_id) { | ||||
| 		struct ib_device *dev = ic->i_cm_id->device; | ||||
| 
 | ||||
| 		rdsdebug("disconnecting cm %p\n", ic->i_cm_id); | ||||
| 		err = rdma_disconnect(ic->i_cm_id); | ||||
| 		if (err) { | ||||
| 			/* Actually this may happen quite frequently, when
 | ||||
| 			 * an outgoing connect raced with an incoming connect. | ||||
| 			 */ | ||||
| 			rdsdebug("failed to disconnect, cm: %p err %d\n", | ||||
| 				ic->i_cm_id, err); | ||||
| 		} | ||||
| 
 | ||||
| 		wait_event(rds_ib_ring_empty_wait, | ||||
| 			rds_ib_ring_empty(&ic->i_send_ring) && | ||||
| 			rds_ib_ring_empty(&ic->i_recv_ring)); | ||||
| 
 | ||||
| 		if (ic->i_send_hdrs) | ||||
| 			ib_dma_free_coherent(dev, | ||||
| 					   ic->i_send_ring.w_nr * | ||||
| 						sizeof(struct rds_header), | ||||
| 					   ic->i_send_hdrs, | ||||
| 					   ic->i_send_hdrs_dma); | ||||
| 
 | ||||
| 		if (ic->i_recv_hdrs) | ||||
| 			ib_dma_free_coherent(dev, | ||||
| 					   ic->i_recv_ring.w_nr * | ||||
| 						sizeof(struct rds_header), | ||||
| 					   ic->i_recv_hdrs, | ||||
| 					   ic->i_recv_hdrs_dma); | ||||
| 
 | ||||
| 		if (ic->i_ack) | ||||
| 			ib_dma_free_coherent(dev, sizeof(struct rds_header), | ||||
| 					     ic->i_ack, ic->i_ack_dma); | ||||
| 
 | ||||
| 		if (ic->i_sends) | ||||
| 			rds_ib_send_clear_ring(ic); | ||||
| 		if (ic->i_recvs) | ||||
| 			rds_ib_recv_clear_ring(ic); | ||||
| 
 | ||||
| 		if (ic->i_cm_id->qp) | ||||
| 			rdma_destroy_qp(ic->i_cm_id); | ||||
| 		if (ic->i_send_cq) | ||||
| 			ib_destroy_cq(ic->i_send_cq); | ||||
| 		if (ic->i_recv_cq) | ||||
| 			ib_destroy_cq(ic->i_recv_cq); | ||||
| 		rdma_destroy_id(ic->i_cm_id); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Move connection back to the nodev list. | ||||
| 		 */ | ||||
| 		if (ic->rds_ibdev) { | ||||
| 
 | ||||
| 			spin_lock_irq(&ic->rds_ibdev->spinlock); | ||||
| 			BUG_ON(list_empty(&ic->ib_node)); | ||||
| 			list_del(&ic->ib_node); | ||||
| 			spin_unlock_irq(&ic->rds_ibdev->spinlock); | ||||
| 
 | ||||
| 			spin_lock_irq(&ib_nodev_conns_lock); | ||||
| 			list_add_tail(&ic->ib_node, &ib_nodev_conns); | ||||
| 			spin_unlock_irq(&ib_nodev_conns_lock); | ||||
| 			ic->rds_ibdev = NULL; | ||||
| 		} | ||||
| 
 | ||||
| 		ic->i_cm_id = NULL; | ||||
| 		ic->i_pd = NULL; | ||||
| 		ic->i_mr = NULL; | ||||
| 		ic->i_send_cq = NULL; | ||||
| 		ic->i_recv_cq = NULL; | ||||
| 		ic->i_send_hdrs = NULL; | ||||
| 		ic->i_recv_hdrs = NULL; | ||||
| 		ic->i_ack = NULL; | ||||
| 	} | ||||
| 	BUG_ON(ic->rds_ibdev); | ||||
| 
 | ||||
| 	/* Clear pending transmit */ | ||||
| 	if (ic->i_rm) { | ||||
| 		rds_message_put(ic->i_rm); | ||||
| 		ic->i_rm = NULL; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Clear the ACK state */ | ||||
| 	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); | ||||
| 	rds_ib_set_64bit(&ic->i_ack_next, 0); | ||||
| 	ic->i_ack_recv = 0; | ||||
| 
 | ||||
| 	/* Clear flow control state */ | ||||
| 	ic->i_flowctl = 0; | ||||
| 	atomic_set(&ic->i_credits, 0); | ||||
| 
 | ||||
| 	rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); | ||||
| 	rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); | ||||
| 
 | ||||
| 	if (ic->i_ibinc) { | ||||
| 		rds_inc_put(&ic->i_ibinc->ii_inc); | ||||
| 		ic->i_ibinc = NULL; | ||||
| 	} | ||||
| 
 | ||||
| 	vfree(ic->i_sends); | ||||
| 	ic->i_sends = NULL; | ||||
| 	vfree(ic->i_recvs); | ||||
| 	ic->i_recvs = NULL; | ||||
| } | ||||
| 
 | ||||
| int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) | ||||
| { | ||||
| 	struct rds_ib_connection *ic; | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	/* XXX too lazy? */ | ||||
| 	ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); | ||||
| 	if (ic == NULL) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	INIT_LIST_HEAD(&ic->ib_node); | ||||
| 	mutex_init(&ic->i_recv_mutex); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * rds_ib_conn_shutdown() waits for these to be emptied so they | ||||
| 	 * must be initialized before it can be called. | ||||
| 	 */ | ||||
| 	rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); | ||||
| 	rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); | ||||
| 
 | ||||
| 	ic->conn = conn; | ||||
| 	conn->c_transport_data = ic; | ||||
| 
 | ||||
| 	spin_lock_irqsave(&ib_nodev_conns_lock, flags); | ||||
| 	list_add_tail(&ic->ib_node, &ib_nodev_conns); | ||||
| 	spin_unlock_irqrestore(&ib_nodev_conns_lock, flags); | ||||
| 
 | ||||
| 
 | ||||
| 	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| void rds_ib_conn_free(void *arg) | ||||
| { | ||||
| 	struct rds_ib_connection *ic = arg; | ||||
| 	rdsdebug("ic %p\n", ic); | ||||
| 	list_del(&ic->ib_node); | ||||
| 	kfree(ic); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /*
 | ||||
|  * An error occurred on the connection | ||||
|  */ | ||||
| void | ||||
| __rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...) | ||||
| { | ||||
| 	va_list ap; | ||||
| 
 | ||||
| 	rds_conn_drop(conn); | ||||
| 
 | ||||
| 	va_start(ap, fmt); | ||||
| 	vprintk(fmt, ap); | ||||
| 	va_end(ap); | ||||
| } | ||||