forked from mirrors/linux
		
	net/rds: Handle ODP mr registration/unregistration
On-Demand-Paging MRs are registered using ib_reg_user_mr and unregistered with ib_dereg_mr. Signed-off-by: Hans Westgaard Ry <hans.westgaard.ry@oracle.com> Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
This commit is contained in:
		
							parent
							
								
									c4c86abb3f
								
							
						
					
					
						commit
						2eafa1746f
					
				
					 7 changed files with 243 additions and 55 deletions
				
			
		|  | @ -156,6 +156,13 @@ static void rds_ib_add_one(struct ib_device *device) | ||||||
| 	has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr && | 	has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr && | ||||||
| 		   device->ops.map_phys_fmr && device->ops.unmap_fmr); | 		   device->ops.map_phys_fmr && device->ops.unmap_fmr); | ||||||
| 	rds_ibdev->use_fastreg = (has_fr && !has_fmr); | 	rds_ibdev->use_fastreg = (has_fr && !has_fmr); | ||||||
|  | 	rds_ibdev->odp_capable = | ||||||
|  | 		!!(device->attrs.device_cap_flags & | ||||||
|  | 		   IB_DEVICE_ON_DEMAND_PAGING) && | ||||||
|  | 		!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & | ||||||
|  | 		   IB_ODP_SUPPORT_WRITE) && | ||||||
|  | 		!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & | ||||||
|  | 		   IB_ODP_SUPPORT_READ); | ||||||
| 
 | 
 | ||||||
| 	rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; | 	rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; | ||||||
| 	rds_ibdev->max_1m_mrs = device->attrs.max_mr ? | 	rds_ibdev->max_1m_mrs = device->attrs.max_mr ? | ||||||
|  |  | ||||||
|  | @ -247,7 +247,8 @@ struct rds_ib_device { | ||||||
| 	struct ib_device	*dev; | 	struct ib_device	*dev; | ||||||
| 	struct ib_pd		*pd; | 	struct ib_pd		*pd; | ||||||
| 	struct dma_pool		*rid_hdrs_pool; /* RDS headers DMA pool */ | 	struct dma_pool		*rid_hdrs_pool; /* RDS headers DMA pool */ | ||||||
| 	bool                    use_fastreg; | 	u8			use_fastreg:1; | ||||||
|  | 	u8			odp_capable:1; | ||||||
| 
 | 
 | ||||||
| 	unsigned int		max_mrs; | 	unsigned int		max_mrs; | ||||||
| 	struct rds_ib_mr_pool	*mr_1m_pool; | 	struct rds_ib_mr_pool	*mr_1m_pool; | ||||||
|  |  | ||||||
|  | @ -67,6 +67,7 @@ struct rds_ib_frmr { | ||||||
| 
 | 
 | ||||||
| /* This is stored as mr->r_trans_private. */ | /* This is stored as mr->r_trans_private. */ | ||||||
| struct rds_ib_mr { | struct rds_ib_mr { | ||||||
|  | 	struct delayed_work		work; | ||||||
| 	struct rds_ib_device		*device; | 	struct rds_ib_device		*device; | ||||||
| 	struct rds_ib_mr_pool		*pool; | 	struct rds_ib_mr_pool		*pool; | ||||||
| 	struct rds_ib_connection	*ic; | 	struct rds_ib_connection	*ic; | ||||||
|  | @ -81,9 +82,11 @@ struct rds_ib_mr { | ||||||
| 	unsigned int			sg_len; | 	unsigned int			sg_len; | ||||||
| 	int				sg_dma_len; | 	int				sg_dma_len; | ||||||
| 
 | 
 | ||||||
|  | 	u8				odp:1; | ||||||
| 	union { | 	union { | ||||||
| 		struct rds_ib_fmr	fmr; | 		struct rds_ib_fmr	fmr; | ||||||
| 		struct rds_ib_frmr	frmr; | 		struct rds_ib_frmr	frmr; | ||||||
|  | 		struct ib_mr		*mr; | ||||||
| 	} u; | 	} u; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | @ -122,12 +125,14 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, | ||||||
| void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); | ||||||
| void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | ||||||
| 		    struct rds_sock *rs, u32 *key_ret, | 		    struct rds_sock *rs, u32 *key_ret, | ||||||
| 		    struct rds_connection *conn); | 		    struct rds_connection *conn, u64 start, u64 length, | ||||||
|  | 		    int need_odp); | ||||||
| void rds_ib_sync_mr(void *trans_private, int dir); | void rds_ib_sync_mr(void *trans_private, int dir); | ||||||
| void rds_ib_free_mr(void *trans_private, int invalidate); | void rds_ib_free_mr(void *trans_private, int invalidate); | ||||||
| void rds_ib_flush_mrs(void); | void rds_ib_flush_mrs(void); | ||||||
| int rds_ib_mr_init(void); | int rds_ib_mr_init(void); | ||||||
| void rds_ib_mr_exit(void); | void rds_ib_mr_exit(void); | ||||||
|  | u32 rds_ib_get_lkey(void *trans_private); | ||||||
| 
 | 
 | ||||||
| void __rds_ib_teardown_mr(struct rds_ib_mr *); | void __rds_ib_teardown_mr(struct rds_ib_mr *); | ||||||
| void rds_ib_teardown_mr(struct rds_ib_mr *); | void rds_ib_teardown_mr(struct rds_ib_mr *); | ||||||
|  |  | ||||||
|  | @ -37,8 +37,15 @@ | ||||||
| 
 | 
 | ||||||
| #include "rds_single_path.h" | #include "rds_single_path.h" | ||||||
| #include "ib_mr.h" | #include "ib_mr.h" | ||||||
|  | #include "rds.h" | ||||||
| 
 | 
 | ||||||
| struct workqueue_struct *rds_ib_mr_wq; | struct workqueue_struct *rds_ib_mr_wq; | ||||||
|  | struct rds_ib_dereg_odp_mr { | ||||||
|  | 	struct work_struct work; | ||||||
|  | 	struct ib_mr *mr; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static void rds_ib_odp_mr_worker(struct work_struct *work); | ||||||
| 
 | 
 | ||||||
| static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) | static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) | ||||||
| { | { | ||||||
|  | @ -213,6 +220,9 @@ void rds_ib_sync_mr(void *trans_private, int direction) | ||||||
| 	struct rds_ib_mr *ibmr = trans_private; | 	struct rds_ib_mr *ibmr = trans_private; | ||||||
| 	struct rds_ib_device *rds_ibdev = ibmr->device; | 	struct rds_ib_device *rds_ibdev = ibmr->device; | ||||||
| 
 | 
 | ||||||
|  | 	if (ibmr->odp) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
| 	switch (direction) { | 	switch (direction) { | ||||||
| 	case DMA_FROM_DEVICE: | 	case DMA_FROM_DEVICE: | ||||||
| 		ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, | 		ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, | ||||||
|  | @ -482,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate) | ||||||
| 
 | 
 | ||||||
| 	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); | 	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); | ||||||
| 
 | 
 | ||||||
|  | 	if (ibmr->odp) { | ||||||
|  | 		/* A MR created and marked as use_once. We use delayed work,
 | ||||||
|  | 		 * because there is a change that we are in interrupt and can't | ||||||
|  | 		 * call to ib_dereg_mr() directly. | ||||||
|  | 		 */ | ||||||
|  | 		INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker); | ||||||
|  | 		queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0); | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	/* Return it to the pool's free list */ | 	/* Return it to the pool's free list */ | ||||||
| 	if (rds_ibdev->use_fastreg) | 	if (rds_ibdev->use_fastreg) | ||||||
| 		rds_ib_free_frmr_list(ibmr); | 		rds_ib_free_frmr_list(ibmr); | ||||||
|  | @ -526,9 +546,17 @@ void rds_ib_flush_mrs(void) | ||||||
| 	up_read(&rds_ib_devices_lock); | 	up_read(&rds_ib_devices_lock); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | u32 rds_ib_get_lkey(void *trans_private) | ||||||
|  | { | ||||||
|  | 	struct rds_ib_mr *ibmr = trans_private; | ||||||
|  | 
 | ||||||
|  | 	return ibmr->u.mr->lkey; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | ||||||
| 		    struct rds_sock *rs, u32 *key_ret, | 		    struct rds_sock *rs, u32 *key_ret, | ||||||
| 		    struct rds_connection *conn) | 		    struct rds_connection *conn, | ||||||
|  | 		    u64 start, u64 length, int need_odp) | ||||||
| { | { | ||||||
| 	struct rds_ib_device *rds_ibdev; | 	struct rds_ib_device *rds_ibdev; | ||||||
| 	struct rds_ib_mr *ibmr = NULL; | 	struct rds_ib_mr *ibmr = NULL; | ||||||
|  | @ -541,6 +569,42 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | ||||||
| 		goto out; | 		goto out; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) { | ||||||
|  | 		u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start; | ||||||
|  | 		int access_flags = | ||||||
|  | 			(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | | ||||||
|  | 			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | | ||||||
|  | 			 IB_ACCESS_ON_DEMAND); | ||||||
|  | 		struct ib_mr *ib_mr; | ||||||
|  | 
 | ||||||
|  | 		if (!rds_ibdev->odp_capable) { | ||||||
|  | 			ret = -EOPNOTSUPP; | ||||||
|  | 			goto out; | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr, | ||||||
|  | 				       access_flags); | ||||||
|  | 
 | ||||||
|  | 		if (IS_ERR(ib_mr)) { | ||||||
|  | 			rdsdebug("rds_ib_get_user_mr returned %d\n", | ||||||
|  | 				 IS_ERR(ib_mr)); | ||||||
|  | 			ret = PTR_ERR(ib_mr); | ||||||
|  | 			goto out; | ||||||
|  | 		} | ||||||
|  | 		if (key_ret) | ||||||
|  | 			*key_ret = ib_mr->rkey; | ||||||
|  | 
 | ||||||
|  | 		ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); | ||||||
|  | 		if (!ibmr) { | ||||||
|  | 			ib_dereg_mr(ib_mr); | ||||||
|  | 			ret = -ENOMEM; | ||||||
|  | 			goto out; | ||||||
|  | 		} | ||||||
|  | 		ibmr->u.mr = ib_mr; | ||||||
|  | 		ibmr->odp = 1; | ||||||
|  | 		return ibmr; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if (conn) | 	if (conn) | ||||||
| 		ic = conn->c_transport_data; | 		ic = conn->c_transport_data; | ||||||
| 
 | 
 | ||||||
|  | @ -629,3 +693,12 @@ void rds_ib_mr_exit(void) | ||||||
| { | { | ||||||
| 	destroy_workqueue(rds_ib_mr_wq); | 	destroy_workqueue(rds_ib_mr_wq); | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | static void rds_ib_odp_mr_worker(struct work_struct  *work) | ||||||
|  | { | ||||||
|  | 	struct rds_ib_mr *ibmr; | ||||||
|  | 
 | ||||||
|  | 	ibmr = container_of(work, struct rds_ib_mr, work.work); | ||||||
|  | 	ib_dereg_mr(ibmr->u.mr); | ||||||
|  | 	kfree(ibmr); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | @ -39,6 +39,7 @@ | ||||||
| #include "rds_single_path.h" | #include "rds_single_path.h" | ||||||
| #include "rds.h" | #include "rds.h" | ||||||
| #include "ib.h" | #include "ib.h" | ||||||
|  | #include "ib_mr.h" | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Convert IB-specific error message to RDS error message and call core |  * Convert IB-specific error message to RDS error message and call core | ||||||
|  | @ -635,6 +636,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | ||||||
| 		send->s_sge[0].addr = ic->i_send_hdrs_dma[pos]; | 		send->s_sge[0].addr = ic->i_send_hdrs_dma[pos]; | ||||||
| 
 | 
 | ||||||
| 		send->s_sge[0].length = sizeof(struct rds_header); | 		send->s_sge[0].length = sizeof(struct rds_header); | ||||||
|  | 		send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; | ||||||
| 
 | 
 | ||||||
| 		memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, | 		memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, | ||||||
| 		       sizeof(struct rds_header)); | 		       sizeof(struct rds_header)); | ||||||
|  | @ -650,6 +652,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | ||||||
| 			send->s_sge[1].addr = sg_dma_address(scat); | 			send->s_sge[1].addr = sg_dma_address(scat); | ||||||
| 			send->s_sge[1].addr += rm->data.op_dmaoff; | 			send->s_sge[1].addr += rm->data.op_dmaoff; | ||||||
| 			send->s_sge[1].length = len; | 			send->s_sge[1].length = len; | ||||||
|  | 			send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; | ||||||
| 
 | 
 | ||||||
| 			bytes_sent += len; | 			bytes_sent += len; | ||||||
| 			rm->data.op_dmaoff += len; | 			rm->data.op_dmaoff += len; | ||||||
|  | @ -858,21 +861,30 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) | ||||||
| 	int ret; | 	int ret; | ||||||
| 	int num_sge; | 	int num_sge; | ||||||
| 	int nr_sig = 0; | 	int nr_sig = 0; | ||||||
|  | 	u64 odp_addr = op->op_odp_addr; | ||||||
|  | 	u32 odp_lkey = 0; | ||||||
| 
 | 
 | ||||||
| 	/* map the op the first time we see it */ | 	/* map the op the first time we see it */ | ||||||
|  | 	if (!op->op_odp_mr) { | ||||||
| 		if (!op->op_mapped) { | 		if (!op->op_mapped) { | ||||||
| 		op->op_count = ib_dma_map_sg(ic->i_cm_id->device, | 			op->op_count = | ||||||
| 					     op->op_sg, op->op_nents, (op->op_write) ? | 				ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, | ||||||
| 					     DMA_TO_DEVICE : DMA_FROM_DEVICE); | 					      op->op_nents, | ||||||
| 		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); | 					      (op->op_write) ? DMA_TO_DEVICE : | ||||||
|  | 							       DMA_FROM_DEVICE); | ||||||
|  | 			rdsdebug("ic %p mapping op %p: %d\n", ic, op, | ||||||
|  | 				 op->op_count); | ||||||
| 			if (op->op_count == 0) { | 			if (op->op_count == 0) { | ||||||
| 				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | 				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | ||||||
| 				ret = -ENOMEM; /* XXX ? */ | 				ret = -ENOMEM; /* XXX ? */ | ||||||
| 				goto out; | 				goto out; | ||||||
| 			} | 			} | ||||||
| 
 |  | ||||||
| 			op->op_mapped = 1; | 			op->op_mapped = 1; | ||||||
| 		} | 		} | ||||||
|  | 	} else { | ||||||
|  | 		op->op_count = op->op_nents; | ||||||
|  | 		odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private); | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Instead of knowing how to return a partial rdma read/write we insist that there | 	 * Instead of knowing how to return a partial rdma read/write we insist that there | ||||||
|  | @ -923,14 +935,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) | ||||||
| 		for (j = 0; j < send->s_rdma_wr.wr.num_sge && | 		for (j = 0; j < send->s_rdma_wr.wr.num_sge && | ||||||
| 		     scat != &op->op_sg[op->op_count]; j++) { | 		     scat != &op->op_sg[op->op_count]; j++) { | ||||||
| 			len = sg_dma_len(scat); | 			len = sg_dma_len(scat); | ||||||
|  | 			if (!op->op_odp_mr) { | ||||||
| 				send->s_sge[j].addr = sg_dma_address(scat); | 				send->s_sge[j].addr = sg_dma_address(scat); | ||||||
| 			send->s_sge[j].length = len; |  | ||||||
| 				send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; | 				send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; | ||||||
|  | 			} else { | ||||||
|  | 				send->s_sge[j].addr = odp_addr; | ||||||
|  | 				send->s_sge[j].lkey = odp_lkey; | ||||||
|  | 			} | ||||||
|  | 			send->s_sge[j].length = len; | ||||||
| 
 | 
 | ||||||
| 			sent += len; | 			sent += len; | ||||||
| 			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); | 			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); | ||||||
| 
 | 
 | ||||||
| 			remote_addr += len; | 			remote_addr += len; | ||||||
|  | 			odp_addr += len; | ||||||
| 			scat++; | 			scat++; | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										109
									
								
								net/rds/rdma.c
									
									
									
									
									
								
							
							
						
						
									
										109
									
								
								net/rds/rdma.c
									
									
									
									
									
								
							|  | @ -177,13 +177,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | ||||||
| 			  struct rds_conn_path *cp) | 			  struct rds_conn_path *cp) | ||||||
| { | { | ||||||
| 	struct rds_mr *mr = NULL, *found; | 	struct rds_mr *mr = NULL, *found; | ||||||
|  | 	struct scatterlist *sg = NULL; | ||||||
| 	unsigned int nr_pages; | 	unsigned int nr_pages; | ||||||
| 	struct page **pages = NULL; | 	struct page **pages = NULL; | ||||||
| 	struct scatterlist *sg; |  | ||||||
| 	void *trans_private; | 	void *trans_private; | ||||||
| 	unsigned long flags; | 	unsigned long flags; | ||||||
| 	rds_rdma_cookie_t cookie; | 	rds_rdma_cookie_t cookie; | ||||||
| 	unsigned int nents; | 	unsigned int nents = 0; | ||||||
|  | 	int need_odp = 0; | ||||||
| 	long i; | 	long i; | ||||||
| 	int ret; | 	int ret; | ||||||
| 
 | 
 | ||||||
|  | @ -197,6 +198,21 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | ||||||
| 		goto out; | 		goto out; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	/* If the combination of the addr and size requested for this memory
 | ||||||
|  | 	 * region causes an integer overflow, return error. | ||||||
|  | 	 */ | ||||||
|  | 	if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || | ||||||
|  | 	    PAGE_ALIGN(args->vec.addr + args->vec.bytes) < | ||||||
|  | 		    (args->vec.addr + args->vec.bytes)) { | ||||||
|  | 		ret = -EINVAL; | ||||||
|  | 		goto out; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if (!can_do_mlock()) { | ||||||
|  | 		ret = -EPERM; | ||||||
|  | 		goto out; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	nr_pages = rds_pages_in_vec(&args->vec); | 	nr_pages = rds_pages_in_vec(&args->vec); | ||||||
| 	if (nr_pages == 0) { | 	if (nr_pages == 0) { | ||||||
| 		ret = -EINVAL; | 		ret = -EINVAL; | ||||||
|  | @ -250,9 +266,11 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | ||||||
| 	 * the zero page. | 	 * the zero page. | ||||||
| 	 */ | 	 */ | ||||||
| 	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); | 	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); | ||||||
| 	if (ret < 0) | 	if (ret == -EOPNOTSUPP) { | ||||||
|  | 		need_odp = 1; | ||||||
|  | 	} else if (ret <= 0) { | ||||||
| 		goto out; | 		goto out; | ||||||
| 
 | 	} else { | ||||||
| 		nents = ret; | 		nents = ret; | ||||||
| 		sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); | 		sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); | ||||||
| 		if (!sg) { | 		if (!sg) { | ||||||
|  | @ -267,19 +285,25 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | ||||||
| 			sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); | 			sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); | ||||||
| 
 | 
 | ||||||
| 		rdsdebug("RDS: trans_private nents is %u\n", nents); | 		rdsdebug("RDS: trans_private nents is %u\n", nents); | ||||||
| 
 | 	} | ||||||
| 	/* Obtain a transport specific MR. If this succeeds, the
 | 	/* Obtain a transport specific MR. If this succeeds, the
 | ||||||
| 	 * s/g list is now owned by the MR. | 	 * s/g list is now owned by the MR. | ||||||
| 	 * Note that dma_map() implies that pending writes are | 	 * Note that dma_map() implies that pending writes are | ||||||
| 	 * flushed to RAM, so no dma_sync is needed here. */ | 	 * flushed to RAM, so no dma_sync is needed here. */ | ||||||
| 	trans_private = rs->rs_transport->get_mr(sg, nents, rs, | 	trans_private = rs->rs_transport->get_mr( | ||||||
| 						 &mr->r_key, | 		sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, | ||||||
| 						 cp ? cp->cp_conn : NULL); | 		args->vec.addr, args->vec.bytes, | ||||||
|  | 		need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); | ||||||
| 
 | 
 | ||||||
| 	if (IS_ERR(trans_private)) { | 	if (IS_ERR(trans_private)) { | ||||||
|  | 		/* In ODP case, we don't GUP pages, so don't need
 | ||||||
|  | 		 * to release anything. | ||||||
|  | 		 */ | ||||||
|  | 		if (!need_odp) { | ||||||
| 			for (i = 0 ; i < nents; i++) | 			for (i = 0 ; i < nents; i++) | ||||||
| 				put_page(sg_page(&sg[i])); | 				put_page(sg_page(&sg[i])); | ||||||
| 			kfree(sg); | 			kfree(sg); | ||||||
|  | 		} | ||||||
| 		ret = PTR_ERR(trans_private); | 		ret = PTR_ERR(trans_private); | ||||||
| 		goto out; | 		goto out; | ||||||
| 	} | 	} | ||||||
|  | @ -293,7 +317,11 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | ||||||
| 	 * map page aligned regions. So we keep the offset, and build | 	 * map page aligned regions. So we keep the offset, and build | ||||||
| 	 * a 64bit cookie containing <R_Key, offset> and pass that | 	 * a 64bit cookie containing <R_Key, offset> and pass that | ||||||
| 	 * around. */ | 	 * around. */ | ||||||
| 	cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); | 	if (need_odp) | ||||||
|  | 		cookie = rds_rdma_make_cookie(mr->r_key, 0); | ||||||
|  | 	else | ||||||
|  | 		cookie = rds_rdma_make_cookie(mr->r_key, | ||||||
|  | 					      args->vec.addr & ~PAGE_MASK); | ||||||
| 	if (cookie_ret) | 	if (cookie_ret) | ||||||
| 		*cookie_ret = cookie; | 		*cookie_ret = cookie; | ||||||
| 
 | 
 | ||||||
|  | @ -458,22 +486,26 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) | ||||||
| { | { | ||||||
| 	unsigned int i; | 	unsigned int i; | ||||||
| 
 | 
 | ||||||
|  | 	if (ro->op_odp_mr) { | ||||||
|  | 		rds_mr_put(ro->op_odp_mr); | ||||||
|  | 	} else { | ||||||
| 		for (i = 0; i < ro->op_nents; i++) { | 		for (i = 0; i < ro->op_nents; i++) { | ||||||
| 			struct page *page = sg_page(&ro->op_sg[i]); | 			struct page *page = sg_page(&ro->op_sg[i]); | ||||||
| 
 | 
 | ||||||
| 			/* Mark page dirty if it was possibly modified, which
 | 			/* Mark page dirty if it was possibly modified, which
 | ||||||
| 			 * is the case for a RDMA_READ which copies from remote | 			 * is the case for a RDMA_READ which copies from remote | ||||||
| 		 * to local memory */ | 			 * to local memory | ||||||
| 		if (!ro->op_write) { | 			 */ | ||||||
| 			WARN_ON(!page->mapping && irqs_disabled()); | 			if (!ro->op_write) | ||||||
| 				set_page_dirty(page); | 				set_page_dirty(page); | ||||||
| 		} |  | ||||||
| 			put_page(page); | 			put_page(page); | ||||||
| 		} | 		} | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	kfree(ro->op_notifier); | 	kfree(ro->op_notifier); | ||||||
| 	ro->op_notifier = NULL; | 	ro->op_notifier = NULL; | ||||||
| 	ro->op_active = 0; | 	ro->op_active = 0; | ||||||
|  | 	ro->op_odp_mr = NULL; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void rds_atomic_free_op(struct rm_atomic_op *ao) | void rds_atomic_free_op(struct rm_atomic_op *ao) | ||||||
|  | @ -583,6 +615,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||||||
| 	struct rds_iovec *iovs; | 	struct rds_iovec *iovs; | ||||||
| 	unsigned int i, j; | 	unsigned int i, j; | ||||||
| 	int ret = 0; | 	int ret = 0; | ||||||
|  | 	bool odp_supported = true; | ||||||
| 
 | 
 | ||||||
| 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) | 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) | ||||||
| 	    || rm->rdma.op_active) | 	    || rm->rdma.op_active) | ||||||
|  | @ -604,6 +637,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||||||
| 		ret = -EINVAL; | 		ret = -EINVAL; | ||||||
| 		goto out_ret; | 		goto out_ret; | ||||||
| 	} | 	} | ||||||
|  | 	/* odp-mr is not supported for multiple requests within one message */ | ||||||
|  | 	if (args->nr_local != 1) | ||||||
|  | 		odp_supported = false; | ||||||
| 
 | 
 | ||||||
| 	iovs = vec->iov; | 	iovs = vec->iov; | ||||||
| 
 | 
 | ||||||
|  | @ -625,6 +661,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||||||
| 	op->op_silent = !!(args->flags & RDS_RDMA_SILENT); | 	op->op_silent = !!(args->flags & RDS_RDMA_SILENT); | ||||||
| 	op->op_active = 1; | 	op->op_active = 1; | ||||||
| 	op->op_recverr = rs->rs_recverr; | 	op->op_recverr = rs->rs_recverr; | ||||||
|  | 	op->op_odp_mr = NULL; | ||||||
|  | 
 | ||||||
| 	WARN_ON(!nr_pages); | 	WARN_ON(!nr_pages); | ||||||
| 	op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret); | 	op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret); | ||||||
| 	if (!op->op_sg) | 	if (!op->op_sg) | ||||||
|  | @ -674,10 +712,44 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||||||
| 		 * If it's a READ operation, we need to pin the pages for writing. | 		 * If it's a READ operation, we need to pin the pages for writing. | ||||||
| 		 */ | 		 */ | ||||||
| 		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); | 		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); | ||||||
| 		if (ret < 0) | 		if ((!odp_supported && ret <= 0) || | ||||||
|  | 		    (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) | ||||||
| 			goto out_pages; | 			goto out_pages; | ||||||
| 		else | 
 | ||||||
| 			ret = 0; | 		if (ret == -EOPNOTSUPP) { | ||||||
|  | 			struct rds_mr *local_odp_mr; | ||||||
|  | 
 | ||||||
|  | 			if (!rs->rs_transport->get_mr) { | ||||||
|  | 				ret = -EOPNOTSUPP; | ||||||
|  | 				goto out_pages; | ||||||
|  | 			} | ||||||
|  | 			local_odp_mr = | ||||||
|  | 				kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); | ||||||
|  | 			if (!local_odp_mr) { | ||||||
|  | 				ret = -ENOMEM; | ||||||
|  | 				goto out_pages; | ||||||
|  | 			} | ||||||
|  | 			RB_CLEAR_NODE(&local_odp_mr->r_rb_node); | ||||||
|  | 			refcount_set(&local_odp_mr->r_refcount, 1); | ||||||
|  | 			local_odp_mr->r_trans = rs->rs_transport; | ||||||
|  | 			local_odp_mr->r_sock = rs; | ||||||
|  | 			local_odp_mr->r_trans_private = | ||||||
|  | 				rs->rs_transport->get_mr( | ||||||
|  | 					NULL, 0, rs, &local_odp_mr->r_key, NULL, | ||||||
|  | 					iov->addr, iov->bytes, ODP_VIRTUAL); | ||||||
|  | 			if (IS_ERR(local_odp_mr->r_trans_private)) { | ||||||
|  | 				ret = IS_ERR(local_odp_mr->r_trans_private); | ||||||
|  | 				rdsdebug("get_mr ret %d %p\"", ret, | ||||||
|  | 					 local_odp_mr->r_trans_private); | ||||||
|  | 				kfree(local_odp_mr); | ||||||
|  | 				ret = -EOPNOTSUPP; | ||||||
|  | 				goto out_pages; | ||||||
|  | 			} | ||||||
|  | 			rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", | ||||||
|  | 				 local_odp_mr, local_odp_mr->r_trans_private); | ||||||
|  | 			op->op_odp_mr = local_odp_mr; | ||||||
|  | 			op->op_odp_addr = iov->addr; | ||||||
|  | 		} | ||||||
| 
 | 
 | ||||||
| 		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", | 		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", | ||||||
| 			 nr_bytes, nr, iov->bytes, iov->addr); | 			 nr_bytes, nr, iov->bytes, iov->addr); | ||||||
|  | @ -693,6 +765,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||||||
| 					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), | 					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), | ||||||
| 					offset); | 					offset); | ||||||
| 
 | 
 | ||||||
|  | 			sg_dma_len(sg) = sg->length; | ||||||
| 			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", | 			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", | ||||||
| 			       sg->offset, sg->length, iov->addr, iov->bytes); | 			       sg->offset, sg->length, iov->addr, iov->bytes); | ||||||
| 
 | 
 | ||||||
|  | @ -711,6 +784,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||||||
| 		goto out_pages; | 		goto out_pages; | ||||||
| 	} | 	} | ||||||
| 	op->op_bytes = nr_bytes; | 	op->op_bytes = nr_bytes; | ||||||
|  | 	ret = 0; | ||||||
| 
 | 
 | ||||||
| out_pages: | out_pages: | ||||||
| 	kfree(pages); | 	kfree(pages); | ||||||
|  | @ -757,7 +831,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | ||||||
| 	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | 	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||||||
| 
 | 
 | ||||||
| 	if (mr) { | 	if (mr) { | ||||||
| 		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); | 		mr->r_trans->sync_mr(mr->r_trans_private, | ||||||
|  | 				     DMA_TO_DEVICE); | ||||||
| 		rm->rdma.op_rdma_mr = mr; | 		rm->rdma.op_rdma_mr = mr; | ||||||
| 	} | 	} | ||||||
| 	return err; | 	return err; | ||||||
|  |  | ||||||
|  | @ -40,7 +40,6 @@ | ||||||
| #ifdef ATOMIC64_INIT | #ifdef ATOMIC64_INIT | ||||||
| #define KERNEL_HAS_ATOMIC64 | #define KERNEL_HAS_ATOMIC64 | ||||||
| #endif | #endif | ||||||
| 
 |  | ||||||
| #ifdef RDS_DEBUG | #ifdef RDS_DEBUG | ||||||
| #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) | #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) | ||||||
| #else | #else | ||||||
|  | @ -478,6 +477,9 @@ struct rds_message { | ||||||
| 			struct rds_notifier	*op_notifier; | 			struct rds_notifier	*op_notifier; | ||||||
| 
 | 
 | ||||||
| 			struct rds_mr		*op_rdma_mr; | 			struct rds_mr		*op_rdma_mr; | ||||||
|  | 
 | ||||||
|  | 			u64			op_odp_addr; | ||||||
|  | 			struct rds_mr		*op_odp_mr; | ||||||
| 		} rdma; | 		} rdma; | ||||||
| 		struct rm_data_op { | 		struct rm_data_op { | ||||||
| 			unsigned int		op_active:1; | 			unsigned int		op_active:1; | ||||||
|  | @ -573,7 +575,8 @@ struct rds_transport { | ||||||
| 	void (*exit)(void); | 	void (*exit)(void); | ||||||
| 	void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, | 	void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, | ||||||
| 			struct rds_sock *rs, u32 *key_ret, | 			struct rds_sock *rs, u32 *key_ret, | ||||||
| 			struct rds_connection *conn); | 			struct rds_connection *conn, | ||||||
|  | 			u64 start, u64 length, int need_odp); | ||||||
| 	void (*sync_mr)(void *trans_private, int direction); | 	void (*sync_mr)(void *trans_private, int direction); | ||||||
| 	void (*free_mr)(void *trans_private, int invalidate); | 	void (*free_mr)(void *trans_private, int invalidate); | ||||||
| 	void (*flush_mrs)(void); | 	void (*flush_mrs)(void); | ||||||
|  | @ -956,6 +959,12 @@ static inline bool rds_destroy_pending(struct rds_connection *conn) | ||||||
| 	       (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); | 	       (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | enum { | ||||||
|  | 	ODP_NOT_NEEDED, | ||||||
|  | 	ODP_ZEROBASED, | ||||||
|  | 	ODP_VIRTUAL | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| /* stats.c */ | /* stats.c */ | ||||||
| DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); | DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); | ||||||
| #define rds_stats_inc_which(which, member) do {		\ | #define rds_stats_inc_which(which, member) do {		\ | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Hans Westgaard Ry
						Hans Westgaard Ry