io_uring/kbuf: use region api for pbuf rings

Convert internal parts of the provided buffer ring management to the
region API. It's the last non-region mapped ring we have, so it also
kills a bunch of now unused memmap.c helpers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/6c40cf7beaa648558acd4d84bc0fb3279a35d74b.1732886067.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 90175f3f50
commit ef62de3c4a

 4 changed files with 74 additions and 241 deletions

 io_uring/kbuf.c | 172
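For context before the diff: the user-visible pbuf-ring flow is unchanged by this patch; only the kernel-side backing moves to io_mapped_region. The sketch below is illustrative and not part of the change. It uses only existing uAPI names from <linux/io_uring.h>; setup_pbuf_ring() itself and its error handling are assumptions made for the example. An application registers a buffer ring with IOU_PBUF_RING_MMAP and then mmap()s it at the PBUF offset, which the kernel now services through a region created by io_create_region_mmap_safe() instead of the old buf_pages/io_pages_map machinery.

/* Illustrative userspace sketch, not part of this commit. */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static struct io_uring_buf_ring *setup_pbuf_ring(int ring_fd, unsigned int entries,
						 unsigned short bgid)
{
	struct io_uring_buf_reg reg;
	void *ring;
	off_t off;

	/* Kernel-allocated ring: leave ring_addr at 0 and ask for mmap. */
	memset(&reg, 0, sizeof(reg));
	reg.ring_entries = entries;	/* must be a power of 2 and < 65536 */
	reg.bgid = bgid;
	reg.flags = IOU_PBUF_RING_MMAP;

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING,
		    &reg, 1) < 0)
		return NULL;

	/* The mmap offset encodes the buffer group id; see io_uring_mmap(). */
	off = IORING_OFF_PBUF_RING | ((off_t)bgid << IORING_OFF_PBUF_SHIFT);
	ring = mmap(NULL, entries * sizeof(struct io_uring_buf),
		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, ring_fd, off);
	return ring == MAP_FAILED ? NULL : ring;
}

liburing ships equivalent helpers; the raw calls are shown here only to make the kernel-side paths in the diff easier to follow.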
									
									
									
									
									
								
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -351,17 +351,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 
 	if (bl->flags & IOBL_BUF_RING) {
 		i = bl->buf_ring->tail - bl->head;
-		if (bl->buf_nr_pages) {
-			int j;
-
-			if (!(bl->flags & IOBL_MMAP)) {
-				for (j = 0; j < bl->buf_nr_pages; j++)
-					unpin_user_page(bl->buf_pages[j]);
-			}
-			io_pages_unmap(bl->buf_ring, &bl->buf_pages,
-					&bl->buf_nr_pages, bl->flags & IOBL_MMAP);
-			bl->flags &= ~IOBL_MMAP;
-		}
+		io_free_region(ctx, &bl->region);
 		/* make sure it's seen as empty */
 		INIT_LIST_HEAD(&bl->buf_list);
 		bl->flags &= ~IOBL_BUF_RING;
@@ -614,75 +604,14 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }
 
-static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
-			    struct io_buffer_list *bl)
-{
-	struct io_uring_buf_ring *br = NULL;
-	struct page **pages;
-	int nr_pages, ret;
-
-	pages = io_pin_pages(reg->ring_addr,
-			     flex_array_size(br, bufs, reg->ring_entries),
-			     &nr_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-
-	br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-	if (!br) {
-		ret = -ENOMEM;
-		goto error_unpin;
-	}
-
-#ifdef SHM_COLOUR
-	/*
-	 * On platforms that have specific aliasing requirements, SHM_COLOUR
-	 * is set and we must guarantee that the kernel and user side align
-	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
-	 * the application mmap's the provided ring buffer. Fail the request
-	 * if we, by chance, don't end up with aligned addresses. The app
-	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
-	 * this transparently.
-	 */
-	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
-		ret = -EINVAL;
-		goto error_unpin;
-	}
-#endif
-	bl->buf_pages = pages;
-	bl->buf_nr_pages = nr_pages;
-	bl->buf_ring = br;
-	bl->flags |= IOBL_BUF_RING;
-	bl->flags &= ~IOBL_MMAP;
-	return 0;
-error_unpin:
-	unpin_user_pages(pages, nr_pages);
-	kvfree(pages);
-	vunmap(br);
-	return ret;
-}
-
-static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
-			      struct io_uring_buf_reg *reg,
-			      struct io_buffer_list *bl)
-{
-	size_t ring_size;
-
-	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
-
-	bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
-	if (IS_ERR(bl->buf_ring)) {
-		bl->buf_ring = NULL;
-		return -ENOMEM;
-	}
-
-	bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
-	return 0;
-}
-
 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 {
 	struct io_uring_buf_reg reg;
 	struct io_buffer_list *bl, *free_bl = NULL;
+	struct io_uring_region_desc rd;
+	struct io_uring_buf_ring *br;
+	unsigned long mmap_offset;
+	unsigned long ring_size;
 	int ret;
 
 	lockdep_assert_held(&ctx->uring_lock);
@@ -694,19 +623,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 		return -EINVAL;
 	if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
 		return -EINVAL;
-	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
-		if (!reg.ring_addr)
-			return -EFAULT;
-		if (reg.ring_addr & ~PAGE_MASK)
-			return -EINVAL;
-	} else {
-		if (reg.ring_addr)
-			return -EINVAL;
-	}
-
 	if (!is_power_of_2(reg.ring_entries))
 		return -EINVAL;
-
 	/* cannot disambiguate full vs empty due to head/tail size */
 	if (reg.ring_entries >= 65536)
 		return -EINVAL;
@@ -722,21 +640,47 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 			return -ENOMEM;
 	}
 
-	if (!(reg.flags & IOU_PBUF_RING_MMAP))
-		ret = io_pin_pbuf_ring(&reg, bl);
-	else
-		ret = io_alloc_pbuf_ring(ctx, &reg, bl);
+	mmap_offset = reg.bgid << IORING_OFF_PBUF_SHIFT;
+	ring_size = flex_array_size(br, bufs, reg.ring_entries);
 
-	if (!ret) {
-		bl->nr_entries = reg.ring_entries;
-		bl->mask = reg.ring_entries - 1;
-		if (reg.flags & IOU_PBUF_RING_INC)
-			bl->flags |= IOBL_INC;
-
-		io_buffer_add_list(ctx, bl, reg.bgid);
-		return 0;
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(ring_size);
+	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
+		rd.user_addr = reg.ring_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
 	}
+	ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
+	if (ret)
+		goto fail;
+	br = io_region_get_ptr(&bl->region);
 
+#ifdef SHM_COLOUR
+	/*
+	 * On platforms that have specific aliasing requirements, SHM_COLOUR
+	 * is set and we must guarantee that the kernel and user side align
+	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
+	 * the application mmap's the provided ring buffer. Fail the request
+	 * if we, by chance, don't end up with aligned addresses. The app
+	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
+	 * this transparently.
+	 */
+	if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
+	    ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
+		ret = -EINVAL;
+		goto fail;
+	}
+#endif
+
+	bl->nr_entries = reg.ring_entries;
+	bl->mask = reg.ring_entries - 1;
+	bl->flags |= IOBL_BUF_RING;
+	bl->buf_ring = br;
+	if (reg.flags & IOU_PBUF_RING_INC)
+		bl->flags |= IOBL_INC;
+	io_buffer_add_list(ctx, bl, reg.bgid);
+	return 0;
+fail:
+	io_free_region(ctx, &bl->region);
 	kfree(free_bl);
 	return ret;
 }
@@ -794,32 +738,18 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
 	return 0;
 }
 
-struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
-				      unsigned long bgid)
+struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
+					    unsigned int bgid)
 {
 	struct io_buffer_list *bl;
 
-	bl = xa_load(&ctx->io_bl_xa, bgid);
-	/* must be a mmap'able buffer ring and have pages */
-	if (bl && bl->flags & IOBL_MMAP)
-		return bl;
-
-	return ERR_PTR(-EINVAL);
-}
-
-int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct io_ring_ctx *ctx = file->private_data;
-	loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
-	struct io_buffer_list *bl;
-	int bgid;
-
 	lockdep_assert_held(&ctx->mmap_lock);
 
-	bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-	bl = io_pbuf_get_bl(ctx, bgid);
-	if (IS_ERR(bl))
-		return PTR_ERR(bl);
+	bl = xa_load(&ctx->io_bl_xa, bgid);
+	if (!bl || !(bl->flags & IOBL_BUF_RING))
+		return NULL;
+	if (WARN_ON_ONCE(!io_region_is_set(&bl->region)))
+		return NULL;
 
-	return io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
+	return &bl->region;
 }
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -3,15 +3,13 @@
 #define IOU_KBUF_H
 
 #include <uapi/linux/io_uring.h>
+#include <linux/io_uring_types.h>
 
 enum {
 	/* ring mapped provided buffers */
 	IOBL_BUF_RING	= 1,
-	/* ring mapped provided buffers, but mmap'ed by application */
-	IOBL_MMAP	= 2,
 	/* buffers are consumed incrementally rather than always fully */
-	IOBL_INC	= 4,
-
+	IOBL_INC	= 2,
 };
 
 struct io_buffer_list {
@@ -21,10 +19,7 @@ struct io_buffer_list {
 	 */
 	union {
 		struct list_head buf_list;
-		struct {
-			struct page **buf_pages;
-			struct io_uring_buf_ring *buf_ring;
-		};
+		struct io_uring_buf_ring *buf_ring;
 	};
 	__u16 bgid;
 
@@ -35,6 +30,8 @@ struct io_buffer_list {
 	__u16 mask;
 
 	__u16 flags;
+
+	struct io_mapped_region region;
 };
 
 struct io_buffer {
@@ -81,9 +78,8 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
 
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 
-struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
-				      unsigned long bgid);
-int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
+struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
+					    unsigned int bgid);
 
 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
 {
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -36,90 +36,6 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
 	return page_address(page);
 }
 
-static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
-				 gfp_t gfp)
-{
-	void *ret;
-	int i;
-
-	for (i = 0; i < nr_pages; i++) {
-		pages[i] = alloc_page(gfp);
-		if (!pages[i])
-			goto err;
-	}
-
-	ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-	if (ret)
-		return ret;
-err:
-	while (i--)
-		put_page(pages[i]);
-	return ERR_PTR(-ENOMEM);
-}
-
-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
-		   size_t size)
-{
-	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
-	struct page **pages;
-	int nr_pages;
-	void *ret;
-
-	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
-	if (!IS_ERR(ret))
-		goto done;
-	if (nr_pages == 1)
-		goto fail;
-
-	ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
-	if (!IS_ERR(ret)) {
-done:
-		*out_pages = pages;
-		*npages = nr_pages;
-		return ret;
-	}
-fail:
-	kvfree(pages);
-	*out_pages = NULL;
-	*npages = 0;
-	return ret;
-}
-
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
-		    bool put_pages)
-{
-	bool do_vunmap = false;
-
-	if (!ptr)
-		return;
-
-	if (put_pages && *npages) {
-		struct page **to_free = *pages;
-		int i;
-
-		/*
-		 * Only did vmap for the non-compound multiple page case.
-		 * For the compound page, we just need to put the head.
-		 */
-		if (PageCompound(to_free[0]))
-			*npages = 1;
-		else if (*npages > 1)
-			do_vunmap = true;
-		for (i = 0; i < *npages; i++)
-			put_page(to_free[i]);
-	}
-	if (do_vunmap)
-		vunmap(ptr);
-	kvfree(*pages);
-	*pages = NULL;
-	*npages = 0;
-}
-
 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 {
 	unsigned long start, end, nr_pages;
@@ -374,16 +290,14 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
 			return ERR_PTR(-EFAULT);
 		return ctx->sq_sqes;
 	case IORING_OFF_PBUF_RING: {
-		struct io_buffer_list *bl;
+		struct io_mapped_region *region;
 		unsigned int bgid;
-		void *ptr;
 
 		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-		bl = io_pbuf_get_bl(ctx, bgid);
-		if (IS_ERR(bl))
-			return bl;
-		ptr = bl->buf_ring;
-		return ptr;
+		region = io_pbuf_get_region(ctx, bgid);
+		if (!region)
+			return ERR_PTR(-EINVAL);
+		return io_region_validate_mmap(ctx, region);
 		}
 	case IORING_MAP_OFF_PARAM_REGION:
 		return io_region_validate_mmap(ctx, &ctx->param_region);
@@ -392,15 +306,6 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
 	return ERR_PTR(-EINVAL);
 }
 
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
-			struct page **pages, int npages)
-{
-	unsigned long nr_pages = npages;
-
-	vm_flags_set(vma, VM_DONTEXPAND);
-	return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
-}
-
 #ifdef CONFIG_MMU
 
 static int io_region_mmap(struct io_ring_ctx *ctx,
@@ -435,8 +340,17 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 		return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit);
 	case IORING_OFF_SQES:
 		return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX);
-	case IORING_OFF_PBUF_RING:
-		return io_pbuf_mmap(file, vma);
+	case IORING_OFF_PBUF_RING: {
+		struct io_mapped_region *region;
+		unsigned int bgid;
+
+		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+		region = io_pbuf_get_region(ctx, bgid);
+		if (!region)
+			return -EINVAL;
+
+		return io_region_mmap(ctx, region, vma, UINT_MAX);
+	}
 	case IORING_MAP_OFF_PARAM_REGION:
 		return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX);
 	}
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -4,13 +4,6 @@
 #define IORING_MAP_OFF_PARAM_REGION		0x20000000ULL
 
 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
-			struct page **pages, int npages);
-
-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
-		   size_t size);
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
-		    bool put_pages);
 
 #ifndef CONFIG_MMU
 unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
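One detail worth pulling out of the memmap.c hunks above: the pbuf mmap offset encodes the buffer group id, and both io_uring_validate_mmap_request() and io_uring_mmap() decode it with the same expression before calling io_pbuf_get_region(). A small sketch of that round trip; the helper names are invented for illustration, the constants are the existing uAPI ones:

/* Illustrative only: offset <-> bgid round trip used by the PBUF mmap path. */
#include <linux/io_uring.h>

/* Userspace encodes: bgid -> IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT). */
static inline unsigned long long pbuf_mmap_offset(unsigned int bgid)
{
	return IORING_OFF_PBUF_RING |
	       ((unsigned long long)bgid << IORING_OFF_PBUF_SHIFT);
}

/* The kernel decodes it back before looking up the region for that group. */
static inline unsigned int pbuf_mmap_bgid(unsigned long long offset)
{
	return (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
}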