| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-10-02 09:56:23 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-10-02 09:56:23 -0700 |
| commit | 5832d26433f2bd0d28f8b12526e3c2fdb203507f | |
| tree | c0cdd1df24131bee06e1318cd453e2790fdf654a /include | |
| parent | Merge tag 'bitmap-for-6.18' of https://github.com/norov/linux | |
| parent | io_uring/cmd: drop unused res2 param from io_uring_cmd_done() | |
Merge tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:
- Store ring provided buffers locally for the users, rather than stuff
them into struct io_kiocb.
These types of buffers must always be fully consumed or recycled in
the current context, and leaving them in struct io_kiocb is hence not
a good idea, as that struct has a vastly different lifetime.
Basically just an architecture cleanup that can help prevent issues
with ring provided buffers in the future.
- Support for mixed CQE sizes in the same ring.
Before this change, a CQ ring either used the default 16b CQEs, or it
was set up with 32b CQEs using IORING_SETUP_CQE32. For use cases where
only a few 32b CQEs were needed, this caused everything else to use big
CQEs too, which is wasteful in terms of both memory usage and memory
bandwidth for the posted CQEs.
With IORING_SETUP_CQE_MIXED, applications may use request types that
post both normal 16b and big 32b CQEs on the same ring (see the setup
and reaping sketch after this list).
- Add helpers for async data management, to make it harder for opcode
handlers to mess it up.
- Add support for multishot uring_cmd, which ublk can use. This helps
improve efficiency by providing a persistent request type that can
trigger multiple CQEs (see the handler sketch after this list).
- Add initial support for ring feature querying.
We had basic support for probe operations, but that API isn't great.
Rather than expand it, add support for QUERY, which is easily
expandable and can cover many more cases than the existing probe
support. This will help applications get a better idea of which
operations are supported on a given host (a query sketch follows this
list).
- zcrx improvements from Pavel:
- Improve refill entry alignment for better caching
- Various cleanups, especially around deduplicating normal
memory vs dmabuf setup.
- Generalisation of the niov size (Patch 12). It's still
hard-coded to PAGE_SIZE on init, but this will let the user
specify the rx buffer length at setup time.
- Syscall / synchronous buffer return. It'll be used as a slow
fallback path for returning buffers when the refill queue is
full. Useful for tolerating slight queue size misconfiguration
or inconsistent load (see the refill sketch after this list).
- Accounting more memory to cgroups.
- Additional independent cleanups that will also be useful for
multi-area support.
- Various fixes and cleanups
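
The mixed-CQE item above maps onto userspace roughly as follows. This is a
minimal, hedged sketch against the new uapi flags visible in the diff below
(IORING_SETUP_CQE_MIXED, IORING_CQE_F_32, IORING_CQE_F_SKIP); it assumes
headers from a 6.18+ kernel, assumes a 32b CQE occupies two consecutive 16b
slots (as the IORING_CQE_F_SKIP filler comment implies), and the
handle_small()/handle_big() consumers are hypothetical:

```c
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* hypothetical consumers, defined elsewhere */
void handle_small(__u64 user_data, __s32 res);
void handle_big(__u64 user_data, __s32 res, __u64 extra1, __u64 extra2);

/* Create a ring that defaults to 16b CQEs but may post 32b ones. */
static int setup_mixed_ring(unsigned int entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_CQE_MIXED;
	return syscall(__NR_io_uring_setup, entries, &p);
}

/*
 * Consume one CQE; the return value is how many 16b slots it occupied,
 * so the caller can advance the CQ head by the right amount.
 */
static unsigned int consume_cqe(struct io_uring_cqe *cqe)
{
	if (cqe->flags & IORING_CQE_F_SKIP)
		return 1;		/* filler before a ring wrap: ignore */
	if (cqe->flags & IORING_CQE_F_32) {
		/* big posting: extra payload lives in big_cqe[0]/big_cqe[1] */
		handle_big(cqe->user_data, cqe->res,
			   cqe->big_cqe[0], cqe->big_cqe[1]);
		return 2;		/* assumed: occupies two 16b slots */
	}
	handle_small(cqe->user_data, cqe->res);
	return 1;
}
```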
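For the multishot uring_cmd item, here is a hedged kernel-side sketch of what
a driver's ->uring_cmd() handler might do when issued with
IORING_URING_CMD_MULTISHOT, built only from the helper declarations added in
include/linux/io_uring/cmd.h below; it also shows the io_br_sel
"keep the buffer local" pattern from the first item. This is not ublk's
actual code: MY_BUF_GROUP and my_copy_event() are hypothetical, and the
helpers' exact return conventions are assumptions.

```c
#include <linux/errno.h>
#include <linux/io_uring/cmd.h>

#define MY_BUF_GROUP	0	/* hypothetical provided-buffer group id */

ssize_t my_copy_event(void __user *dst, size_t len);	/* hypothetical */

static int my_mshot_uring_cmd(struct io_uring_cmd *ioucmd,
			      unsigned int issue_flags)
{
	size_t len = 0;
	struct io_br_sel sel;

	/* pick a ring-provided buffer; must be consumed in this context */
	sel = io_uring_cmd_buffer_select(ioucmd, MY_BUF_GROUP, &len,
					 issue_flags);
	if (!sel.buf_list)
		return sel.val;		/* assumed: .val holds the error */

	/* copy the event into the selected user buffer */
	sel.val = my_copy_event(sel.addr, len);

	/* post a CQE for this event without retiring the request */
	if (!io_uring_mshot_cmd_post_cqe(ioucmd, &sel, issue_flags))
		return -ECANCELED;	/* assumed: false means terminate */
	return -EIOCBQUEUED;		/* stay armed for the next event */
}
```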
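The query item maps onto the new include/uapi/linux/io_uring/query.h shown in
the diff below. A hedged sketch of a single IO_URING_QUERY_OPCODES query
follows; how query_data/size/next_entry are consumed is inferred from the
header, the io_uring_register() nr_args convention (0 here) is an assumption,
and per the header's comment this particular query may not even require a
ring:

```c
#include <linux/io_uring.h>
#include <linux/io_uring/query.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int query_opcodes(int ring_fd)
{
	struct io_uring_query_opcode op;
	struct io_uring_query_hdr hdr;

	memset(&op, 0, sizeof(op));
	memset(&hdr, 0, sizeof(hdr));
	hdr.query_op = IO_URING_QUERY_OPCODES;
	hdr.query_data = (uintptr_t)&op;	/* kernel fills this struct */
	hdr.size = sizeof(op);
	/* hdr.next_entry == 0: single query, no chaining */

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_QUERY,
		    &hdr, 0) < 0)
		return -1;
	if (hdr.result < 0)
		return hdr.result;	/* per-entry status */

	printf("request ops: %u, register ops: %u, features: 0x%llx\n",
	       op.nr_request_opcodes, op.nr_register_opcodes,
	       (unsigned long long)op.feature_flags);
	return 0;
}
```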
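And for the synchronous zcrx buffer-return fallback, a sketch built from
struct io_uring_zcrx_sync_refill and IORING_REGISTER_ZCRX_REFILL in the uapi
diff below; nr_args of 1 is an assumption, and struct io_uring_zcrx_rqe is
the pre-existing zcrx refill-queue entry type:

```c
#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int zcrx_sync_return(int ring_fd, unsigned int zcrx_id,
			    const struct io_uring_zcrx_rqe *rqes,
			    unsigned int nr)
{
	struct io_uring_zcrx_sync_refill arg;

	memset(&arg, 0, sizeof(arg));
	arg.zcrx_id = zcrx_id;		/* which zcrx instance to refill */
	arg.nr_entries = nr;		/* number of rqes being returned */
	arg.rqes = (uintptr_t)rqes;	/* userspace array of rqes */

	/* slow path only: the refill ring remains the fast path */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ZCRX_REFILL, &arg, 1);
}
```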
* tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits)
io_uring/cmd: drop unused res2 param from io_uring_cmd_done()
io_uring: fix nvme's 32b cqes on mixed cq
io_uring/query: cap number of queries
io_uring/query: prevent infinite loops
io_uring/zcrx: account niov arrays to cgroup
io_uring/zcrx: allow synchronous buffer return
io_uring/zcrx: introduce io_parse_rqe()
io_uring/zcrx: don't adjust free cache space
io_uring/zcrx: use guards for the refill lock
io_uring/zcrx: reduce netmem scope in refill
io_uring/zcrx: protect netdev with pp_lock
io_uring/zcrx: rename dma lock
io_uring/zcrx: make niov size variable
io_uring/zcrx: set sgt for umem area
io_uring/zcrx: remove dmabuf_offset
io_uring/zcrx: deduplicate area mapping
io_uring/zcrx: pass ifq to io_zcrx_alloc_fallback()
io_uring/zcrx: check all niovs filled with dma addresses
io_uring/zcrx: move area reg checks into io_import_area
io_uring/zcrx: don't pass slot to io_zcrx_create_area
...
Diffstat (limited to 'include')
| -rw-r--r-- | include/linux/io_uring/cmd.h | 69 |
| -rw-r--r-- | include/linux/io_uring_types.h | 31 |
| -rw-r--r-- | include/linux/poison.h | 3 |
| -rw-r--r-- | include/trace/events/io_uring.h | 4 |
| -rw-r--r-- | include/uapi/linux/io_uring.h | 38 |
| -rw-r--r-- | include/uapi/linux/io_uring/query.h | 41 |

6 files changed, 151 insertions(+), 35 deletions(-)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index cfa6d0c0c322..7509025b4071 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -11,11 +11,14 @@
 /* io_uring_cmd is being issued again */
 #define IORING_URING_CMD_REISSUE	(1U << 31)
 
+typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd,
+				  unsigned issue_flags);
+
 struct io_uring_cmd {
 	struct file	*file;
 	const struct io_uring_sqe *sqe;
 	/* callback to defer completions to task context */
-	void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
+	io_uring_cmd_tw_t task_work_cb;
 	u32		cmd_op;
 	u32		flags;
 	u8		pdu[32]; /* available inline for free use */
@@ -53,11 +56,11 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
  * Note: the caller should never hard code @issue_flags and is only allowed
  * to pass the mask provided by the core io_uring code.
  */
-void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2,
-		       unsigned issue_flags);
+void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2,
+			 unsigned issue_flags, bool is_cqe32);
 
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			       void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+			       io_uring_cmd_tw_t task_work_cb,
 			       unsigned flags);
 
 /*
@@ -70,6 +73,21 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 /* Execute the request from a blocking context */
 void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
 
+/*
+ * Select a buffer from the provided buffer group for multishot uring_cmd.
+ * Returns the selected buffer address and size.
+ */
+struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
+					    unsigned buf_group, size_t *len,
+					    unsigned int issue_flags);
+
+/*
+ * Complete a multishot uring_cmd event. This will post a CQE to the completion
+ * queue and update the provided buffer.
+ */
+bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
+				 struct io_br_sel *sel, unsigned int issue_flags);
+
 #else
 static inline int
 io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -86,13 +104,12 @@ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
 {
 	return -EOPNOTSUPP;
 }
-static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
-		u64 ret2, unsigned issue_flags)
+static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret,
+		u64 ret2, unsigned issue_flags, bool is_cqe32)
 {
 }
 static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned),
-			unsigned flags)
+			io_uring_cmd_tw_t task_work_cb, unsigned flags)
 {
 }
 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
@@ -102,28 +119,28 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
 {
 }
-#endif
-
-/*
- * Polled completions must ensure they are coming from a poll queue, and
- * hence are completed inside the usual poll handling loops.
- */
-static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd,
-					    ssize_t ret, ssize_t res2)
+static inline struct io_br_sel
+io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group,
+			   size_t *len, unsigned int issue_flags)
+{
+	return (struct io_br_sel) { .val = -EOPNOTSUPP };
+}
+static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
+		struct io_br_sel *sel, unsigned int issue_flags)
 {
-	lockdep_assert(in_task());
-	io_uring_cmd_done(ioucmd, ret, res2, 0);
+	return true;
 }
+#endif
 
 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+			io_uring_cmd_tw_t task_work_cb)
 {
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+			io_uring_cmd_tw_t task_work_cb)
 {
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
 }
@@ -142,6 +159,18 @@ static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd)
 	return cmd_to_io_kiocb(cmd)->ctx;
 }
 
+static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret,
+				     unsigned issue_flags)
+{
+	return __io_uring_cmd_done(ioucmd, ret, 0, issue_flags, false);
+}
+
+static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret,
+				       u64 res2, unsigned issue_flags)
+{
+	return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true);
+}
+
 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
 			    void (*release)(void *), unsigned int index,
 			    unsigned int issue_flags);
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 12f5ee43850e..c2ea6280901d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -86,6 +86,25 @@ struct io_mapped_region {
 };
 
 /*
+ * Return value from io_buffer_list selection, to avoid stashing it in
+ * struct io_kiocb. For legacy/classic provided buffers, keeping a reference
+ * across execution contexts are fine. But for ring provided buffers, the
+ * list may go away as soon as ->uring_lock is dropped. As the io_kiocb
+ * persists, it's better to just keep the buffer local for those cases.
+ */
+struct io_br_sel {
+	struct io_buffer_list *buf_list;
+	/*
+	 * Some selection parts return the user address, others return an error.
+	 */
+	union {
+		void __user *addr;
+		ssize_t val;
+	};
+};
+
+
+/*
  * Arbitrary limit, can be raised if need be
  */
 #define IO_RINGFD_REG_MAX 16
@@ -671,12 +690,6 @@ struct io_kiocb {
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
 		struct io_buffer	*kbuf;
 
-		/*
-		 * stores buffer ID for ring provided buffers, valid IFF
-		 * REQ_F_BUFFER_RING is set.
-		 */
-		struct io_buffer_list	*buf_list;
-
 		struct io_rsrc_node	*buf_node;
 	};
 
@@ -724,10 +737,4 @@ struct io_overflow_cqe {
 	struct list_head list;
 	struct io_uring_cqe cqe;
 };
-
-static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
-{
-	return ctx->flags & IORING_SETUP_CQE32;
-}
-
 #endif
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 8ca2235f78d5..299e2dd7da6d 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -90,4 +90,7 @@
 /********** lib/stackdepot.c **********/
 #define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA))
 
+/********** io_uring/ **********/
+#define IO_URING_PTR_POISON ((void *)(0x1091UL + POISON_POINTER_DELTA))
+
 #endif
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 178ab6f611be..45d15460b495 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -340,8 +340,8 @@ TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),
 		__entry->user_data	= cqe->user_data;
 		__entry->res		= cqe->res;
 		__entry->cflags		= cqe->flags;
-		__entry->extra1		= io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
-		__entry->extra2		= io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
+		__entry->extra1		= ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[0] : 0;
+		__entry->extra2		= ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[1] : 0;
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6957dc539d83..a0cc1cc0dd01 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -225,6 +225,12 @@ enum io_uring_sqe_flags_bit {
 /* Use hybrid poll in iopoll process */
 #define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)
 
+/*
+ * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
+ * IORING_CQE_F_32 set in cqe->flags.
+ */
+#define IORING_SETUP_CQE_MIXED		(1U << 18)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
@@ -298,9 +304,13 @@ enum io_uring_op {
  * sqe->uring_cmd_flags		top 8bits aren't available for userspace
  * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
  *				along with setting sqe->buf_index.
+ * IORING_URING_CMD_MULTISHOT	must be used with buffer select, like other
+ *				multishot commands. Not compatible with
+ *				IORING_URING_CMD_FIXED, for now.
  */
 #define IORING_URING_CMD_FIXED	(1U << 0)
-#define IORING_URING_CMD_MASK	IORING_URING_CMD_FIXED
+#define IORING_URING_CMD_MULTISHOT	(1U << 1)
+#define IORING_URING_CMD_MASK	(IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT)
 
 
 /*
@@ -454,6 +464,7 @@ enum io_uring_msg_ring_flags {
 #define IORING_NOP_FIXED_FILE	(1U << 2)
 #define IORING_NOP_FIXED_BUFFER	(1U << 3)
 #define IORING_NOP_TW		(1U << 4)
+#define IORING_NOP_CQE32	(1U << 5)
 
 /*
  * IO completion data structure (Completion Queue Entry)
@@ -487,12 +498,22 @@ struct io_uring_cqe {
  *			other provided buffer type, all completions with a
  *			buffer passed back is automatically returned to the
  *			application.
+ * IORING_CQE_F_SKIP	If set, then the application/liburing must ignore this
+ *			CQE. It's only purpose is to fill a gap in the ring,
+ *			if a large CQE is attempted posted when the ring has
+ *			just a single small CQE worth of space left before
+ *			wrapping.
+ * IORING_CQE_F_32	If set, this is a 32b/big-cqe posting. Use with rings
+ *			setup in a mixed CQE mode, where both 16b and 32b
+ *			CQEs may be posted to the CQ ring.
 */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
 #define IORING_CQE_F_BUF_MORE		(1U << 4)
+#define IORING_CQE_F_SKIP		(1U << 5)
+#define IORING_CQE_F_32			(1U << 15)
 
 #define IORING_CQE_BUFFER_SHIFT		16
@@ -665,6 +686,12 @@ enum io_uring_register_op {
 	IORING_REGISTER_MEM_REGION		= 34,
 
+	/* query various aspects of io_uring, see linux/io_uring/query.h */
+	IORING_REGISTER_QUERY			= 35,
+
+	/* return zcrx buffers back into circulation */
+	IORING_REGISTER_ZCRX_REFILL		= 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
@@ -1046,6 +1073,15 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };
 
+struct io_uring_zcrx_sync_refill {
+	__u32	zcrx_id;
+	/* the number of entries to return */
+	__u32	nr_entries;
+	/* pointer to an array of struct io_uring_zcrx_rqe */
+	__u64	rqes;
+	__u64	__resv[2];
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
new file mode 100644
index 000000000000..5d754322a27c
--- /dev/null
+++ b/include/uapi/linux/io_uring/query.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
+/*
+ * Header file for the io_uring query interface.
+ */
+#ifndef LINUX_IO_URING_QUERY_H
+#define LINUX_IO_URING_QUERY_H
+
+#include <linux/types.h>
+
+struct io_uring_query_hdr {
+	__u64	next_entry;
+	__u64	query_data;
+	__u32	query_op;
+	__u32	size;
+	__s32	result;
+	__u32	__resv[3];
+};
+
+enum {
+	IO_URING_QUERY_OPCODES			= 0,
+
+	__IO_URING_QUERY_MAX,
+};
+
+/* Doesn't require a ring */
+struct io_uring_query_opcode {
+	/* The number of supported IORING_OP_* opcodes */
+	__u32	nr_request_opcodes;
+	/* The number of supported IORING_[UN]REGISTER_* opcodes */
+	__u32	nr_register_opcodes;
+	/* Bitmask of all supported IORING_FEAT_* flags */
+	__u64	feature_flags;
+	/* Bitmask of all supported IORING_SETUP_* flags */
+	__u64	ring_setup_flags;
+	/* Bitmask of all supported IORING_ENTER_* flags */
+	__u64	enter_flags;
+	/* Bitmask of all supported IOSQE_* flags */
+	__u64	sqe_flags;
+};
+
+#endif
