mirror of
https://github.com/torvalds/linux.git
synced 2025-10-30 00:06:59 +02:00
-----BEGIN PGP SIGNATURE-----
iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmjbLEcQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpnEUD/4/FgfQP2LFS/88BBF5ukZjRySe4wmyyZ2Q
MFh2ehdxzkZxVXjbeA2wRAXdqjw2MbNhx8tzU9VrW7rweNDZxHbwi6jJIP7OAjxE
4ZP0goAQj7P0TFyXC2KGj7k6dP20FkAltx5gGLVwsuOWDDrQKp2EykAcRnGYAD4W
3yf+nojVr2bjHyO7dx8dM7jUDjMg7J8nmHD6zgHOlHRLblWwfzw907bhz+eBX/FI
9kYvtX2c9MgY4Isa+43rZd5qvj9S3Cs8PD6tFPbq+n+3l7yWgMBTu/y+SNI8hupT
W7CqjPcpvppFHhPkcXDA3yARnW7ccEx5aiQuvUCmRUioHtGwXvC63HMp8OjcQspV
NNoIHYFsi1alzYq2kJLxY1IleWZ8j0hUkSSU8u7al8VIvtD43LGkv51xavxQUFjg
BO9mLyS51H2agffySs4vhHJE82lZizvmh/RJfSJ0ezALzE2k42MrximX1D1rBJE6
KPOhCiPt/jqpQMyqDYnY10FgTXQVwgPIVH1JLpo611tPFHlGW8Y4YxxR1Xduh5JX
jbGLEjVREsDZ7EHrimLNLmJRAQpyQujv/yhf7k96gWBelVwVuISQLI4Ca5IeVQyk
9yifgLXNGddgAwj0POMFeKXSm2We9nrrPDYLCKrsBMSN96/3SLveJC7fkW88aUZr
ye4/K8Y3vA==
=uc/3
-----END PGP SIGNATURE-----
Merge tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:
- Store ring provided buffers locally for the users, rather than stuff
them into struct io_kiocb.
These types of buffers must always be fully consumed or recycled in
the current context, and leaving them in struct io_kiocb is hence not
a good idea as that struct has a vastly different lifetime.
Basically just an architecture cleanup that can help prevent issues
with ring provided buffers in the future.
- Support for mixed CQE sizes in the same ring.
Before this change, a CQ ring either used the default 16b CQEs, or it
was setup with 32b CQE using IORING_SETUP_CQE32. For use cases where
a few 32b CQEs were needed, this caused everything else to use big
CQEs. This is wasteful both in terms of memory usage, but also memory
bandwidth for the posted CQEs.
With IORING_SETUP_CQE_MIXED, applications may use request types that
post both normal 16b and big 32b CQEs on the same ring.
- Add helpers for async data management, to make it harder for opcode
handlers to mess it up.
- Add support for multishot for uring_cmd, which ublk can use. This
helps improve efficiency, by providing a persistent request type that
can trigger multiple CQEs.
- Add initial support for ring feature querying.
We had basic support for probe operations, but the API isn't great.
Rather than expand that, add support for QUERY which is easily
expandable and can cover a lot more cases than the existing probe
support. This will help applications get a better idea of what
operations are supported on a given host.
- zcrx improvements from Pavel:
- Improve refill entry alignment for better caching
- Various cleanups, especially around deduplicating normal
memory vs dmabuf setup.
- Generalisation of the niov size (Patch 12). It's still hard
coded to PAGE_SIZE on init, but will let the user to specify
the rx buffer length on setup.
- Syscall / synchronous buffer return. It'll be used as a slow
fallback path for returning buffers when the refill queue is
full. Useful for tolerating slight queue size misconfiguration
or with inconsistent load.
- Accounting more memory to cgroups.
- Additional independent cleanups that will also be useful for
multi-area support.
- Various fixes and cleanups
* tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits)
io_uring/cmd: drop unused res2 param from io_uring_cmd_done()
io_uring: fix nvme's 32b cqes on mixed cq
io_uring/query: cap number of queries
io_uring/query: prevent infinite loops
io_uring/zcrx: account niov arrays to cgroup
io_uring/zcrx: allow synchronous buffer return
io_uring/zcrx: introduce io_parse_rqe()
io_uring/zcrx: don't adjust free cache space
io_uring/zcrx: use guards for the refill lock
io_uring/zcrx: reduce netmem scope in refill
io_uring/zcrx: protect netdev with pp_lock
io_uring/zcrx: rename dma lock
io_uring/zcrx: make niov size variable
io_uring/zcrx: set sgt for umem area
io_uring/zcrx: remove dmabuf_offset
io_uring/zcrx: deduplicate area mapping
io_uring/zcrx: pass ifq to io_zcrx_alloc_fallback()
io_uring/zcrx: check all niovs filled with dma addresses
io_uring/zcrx: move area reg checks into io_import_area
io_uring/zcrx: don't pass slot to io_zcrx_create_area
...
139 lines
3.4 KiB
C
139 lines
3.4 KiB
C
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/file.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/net.h>
|
|
#include <linux/io_uring.h>
|
|
|
|
#include "io_uring.h"
|
|
#include "notif.h"
|
|
#include "rsrc.h"
|
|
|
|
static const struct ubuf_info_ops io_ubuf_ops;
|
|
|
|
/*
 * Task-work completion handler for a chain of zerocopy notifications.
 * Walks the singly linked ->next list starting from the chain head and
 * completes each notification in the same task context.
 */
static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw)
{
	struct io_notif_data *nd = io_notif_to_data(notif);
	struct io_ring_ctx *ctx = notif->ctx;

	lockdep_assert_held(&ctx->uring_lock);

	do {
		notif = cmd_to_io_kiocb(nd);

		/* all linked notifications must belong to the same ring */
		if (WARN_ON_ONCE(ctx != notif->ctx))
			return;
		/* net stack must have dropped all buffer references by now */
		lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0);

		/* tell userspace if data was copied instead of zerocopy-sent */
		if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used))
			notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED;

		if (nd->account_pages && notif->ctx->user) {
			__io_unaccount_mem(notif->ctx->user, nd->account_pages);
			nd->account_pages = 0;
		}

		/* grab ->next before completion may free the request */
		nd = nd->next;
		io_req_task_complete(notif, tw);
	} while (nd);
}
|
|
|
|
/*
 * ubuf_info ->complete callback, invoked by the networking stack when a
 * zerocopy skb is done with the notification's buffer. Drops one
 * reference; when the final reference on the chain head is gone, queues
 * task_work to post the CQE(s).
 */
void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg,
			 bool success)
{
	struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
	struct io_kiocb *notif = cmd_to_io_kiocb(nd);
	unsigned tw_flags;

	if (nd->zc_report) {
		/* flags only ever go false->true, so lost races are benign */
		if (success && !nd->zc_used && skb)
			WRITE_ONCE(nd->zc_used, true);
		else if (!success && !nd->zc_copied)
			WRITE_ONCE(nd->zc_copied, true);
	}

	if (!refcount_dec_and_test(&uarg->refcnt))
		return;

	/* non-head members forward their final put to the chain head */
	if (nd->head != nd) {
		io_tx_ubuf_complete(skb, &nd->head->uarg, success);
		return;
	}

	/* only a chain tail (no ->next) may use the lazy-wake optimisation */
	tw_flags = nd->next ? 0 : IOU_F_TWQ_LAZY_WAKE;
	notif->io_task_work.func = io_notif_tw_complete;
	__io_req_task_work_add(notif, tw_flags);
}
|
|
|
|
/*
 * ubuf_info ->link_skb callback: attach @uarg's notification to an skb
 * that may already carry another io_uring notification. On success the
 * notifications are chained so they complete together; -EEXIST tells
 * the caller to allocate a fresh skb instead.
 */
static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg)
{
	struct io_notif_data *nd, *prev_nd;
	struct io_kiocb *prev_notif, *notif;
	struct ubuf_info *prev_uarg = skb_zcopy(skb);

	nd = container_of(uarg, struct io_notif_data, uarg);
	notif = cmd_to_io_kiocb(nd);

	/* skb carries no notification yet, just install ours */
	if (!prev_uarg) {
		net_zcopy_get(&nd->uarg);
		skb_zcopy_init(skb, &nd->uarg);
		return 0;
	}
	/* handle it separately as we can't link a notif to itself */
	if (unlikely(prev_uarg == &nd->uarg))
		return 0;
	/* we can't join two links together, just request a fresh skb */
	if (unlikely(nd->head != nd || nd->next))
		return -EEXIST;
	/* don't mix zc providers */
	if (unlikely(prev_uarg->ops != &io_ubuf_ops))
		return -EEXIST;

	prev_nd = container_of(prev_uarg, struct io_notif_data, uarg);
	prev_notif = cmd_to_io_kiocb(prev_nd);

	/* make sure all notifications can be finished in the same task_work */
	if (unlikely(notif->ctx != prev_notif->ctx ||
		     notif->tctx != prev_notif->tctx))
		return -EEXIST;

	/* splice ourselves in right after the previous notification */
	nd->head = prev_nd->head;
	nd->next = prev_nd->next;
	prev_nd->next = nd;
	net_zcopy_get(&nd->head->uarg);
	return 0;
}
|
|
|
|
static const struct ubuf_info_ops io_ubuf_ops = {
|
|
.complete = io_tx_ubuf_complete,
|
|
.link_skb = io_link_skb,
|
|
};
|
|
|
|
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
|
|
__must_hold(&ctx->uring_lock)
|
|
{
|
|
struct io_kiocb *notif;
|
|
struct io_notif_data *nd;
|
|
|
|
if (unlikely(!io_alloc_req(ctx, ¬if)))
|
|
return NULL;
|
|
notif->ctx = ctx;
|
|
notif->opcode = IORING_OP_NOP;
|
|
notif->flags = 0;
|
|
notif->file = NULL;
|
|
notif->tctx = current->io_uring;
|
|
io_get_task_refs(1);
|
|
notif->file_node = NULL;
|
|
notif->buf_node = NULL;
|
|
|
|
nd = io_notif_to_data(notif);
|
|
nd->zc_report = false;
|
|
nd->account_pages = 0;
|
|
nd->next = NULL;
|
|
nd->head = nd;
|
|
|
|
nd->uarg.flags = IO_NOTIF_UBUF_FLAGS;
|
|
nd->uarg.ops = &io_ubuf_ops;
|
|
refcount_set(&nd->uarg.refcnt, 1);
|
|
return notif;
|
|
}
|