io_uring: Add io_uring_setup flag to pre-register ring fd and never install it
With IORING_REGISTER_USE_REGISTERED_RING, an application can register the ring fd and use it via registered index rather than installed fd. This allows using a registered ring for everything *except* the initial mmap.

With IORING_SETUP_NO_MMAP, io_uring_setup uses buffers allocated by the user, rather than requiring a subsequent mmap.

The combination of the two allows a user to operate *entirely* via a registered ring fd, making it unnecessary to ever install the fd in the first place. So, add a flag IORING_SETUP_REGISTERED_FD_ONLY to make io_uring_setup register the fd and return a registered index, without installing the fd.

This allows an application to avoid touching the fd table at all, and allows a library to never even momentarily install a file descriptor.

This splits out an io_ring_add_registered_file helper from io_ring_add_registered_fd, for use by io_uring_setup.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Link: https://lore.kernel.org/r/bc8f431bada371c183b95a83399628b605e978a3.1682699803.git.josh@joshtriplett.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
03d89a2de2
commit
6e76ac5958
|
@ -178,6 +178,13 @@ enum {
|
||||||
*/
|
*/
|
||||||
#define IORING_SETUP_NO_MMAP (1U << 14)
|
#define IORING_SETUP_NO_MMAP (1U << 14)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Register the ring fd in itself for use with
|
||||||
|
* IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
|
||||||
|
* than an fd.
|
||||||
|
*/
|
||||||
|
#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15)
|
||||||
|
|
||||||
enum io_uring_op {
|
enum io_uring_op {
|
||||||
IORING_OP_NOP,
|
IORING_OP_NOP,
|
||||||
IORING_OP_READV,
|
IORING_OP_READV,
|
||||||
|
|
|
@ -3788,19 +3788,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
|
static int io_uring_install_fd(struct file *file)
|
||||||
{
|
{
|
||||||
int ret, fd;
|
int fd;
|
||||||
|
|
||||||
fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
|
fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
|
||||||
if (fd < 0)
|
if (fd < 0)
|
||||||
return fd;
|
return fd;
|
||||||
|
|
||||||
ret = __io_uring_add_tctx_node(ctx);
|
|
||||||
if (ret) {
|
|
||||||
put_unused_fd(fd);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
fd_install(fd, file);
|
fd_install(fd, file);
|
||||||
return fd;
|
return fd;
|
||||||
}
|
}
|
||||||
|
@ -3840,6 +3834,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
|
||||||
struct io_uring_params __user *params)
|
struct io_uring_params __user *params)
|
||||||
{
|
{
|
||||||
struct io_ring_ctx *ctx;
|
struct io_ring_ctx *ctx;
|
||||||
|
struct io_uring_task *tctx;
|
||||||
struct file *file;
|
struct file *file;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
@ -3851,6 +3846,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
|
||||||
entries = IORING_MAX_ENTRIES;
|
entries = IORING_MAX_ENTRIES;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
|
||||||
|
&& !(p->flags & IORING_SETUP_NO_MMAP))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Use twice as many entries for the CQ ring. It's possible for the
|
* Use twice as many entries for the CQ ring. It's possible for the
|
||||||
* application to drive a higher depth than the size of the SQ ring,
|
* application to drive a higher depth than the size of the SQ ring,
|
||||||
|
@ -4007,22 +4006,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
|
||||||
goto err;
|
goto err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret = __io_uring_add_tctx_node(ctx);
|
||||||
|
if (ret)
|
||||||
|
goto err_fput;
|
||||||
|
tctx = current->io_uring;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Install ring fd as the very last thing, so we don't risk someone
|
* Install ring fd as the very last thing, so we don't risk someone
|
||||||
* having closed it before we finish setup
|
* having closed it before we finish setup
|
||||||
*/
|
*/
|
||||||
ret = io_uring_install_fd(ctx, file);
|
if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
|
||||||
if (ret < 0) {
|
ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
|
||||||
/* fput will clean it up */
|
else
|
||||||
fput(file);
|
ret = io_uring_install_fd(file);
|
||||||
return ret;
|
if (ret < 0)
|
||||||
}
|
goto err_fput;
|
||||||
|
|
||||||
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
|
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
|
||||||
return ret;
|
return ret;
|
||||||
err:
|
err:
|
||||||
io_ring_ctx_wait_and_kill(ctx);
|
io_ring_ctx_wait_and_kill(ctx);
|
||||||
return ret;
|
return ret;
|
||||||
|
err_fput:
|
||||||
|
fput(file);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -4049,7 +4056,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
|
||||||
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
|
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
|
||||||
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
|
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
|
||||||
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
|
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
|
||||||
IORING_SETUP_NO_MMAP))
|
IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
return io_uring_create(entries, &p, params);
|
return io_uring_create(entries, &p, params);
|
||||||
|
|
|
@ -75,6 +75,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
|
||||||
int io_uring_alloc_task_context(struct task_struct *task,
|
int io_uring_alloc_task_context(struct task_struct *task,
|
||||||
struct io_ring_ctx *ctx);
|
struct io_ring_ctx *ctx);
|
||||||
|
|
||||||
|
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
|
||||||
|
int start, int end);
|
||||||
|
|
||||||
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
|
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
|
||||||
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
|
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
|
||||||
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
|
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
|
||||||
|
|
|
@ -208,29 +208,38 @@ void io_uring_unreg_ringfd(void)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
|
||||||
|
int start, int end)
|
||||||
|
{
|
||||||
|
int offset;
|
||||||
|
for (offset = start; offset < end; offset++) {
|
||||||
|
offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
|
||||||
|
if (tctx->registered_rings[offset])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
tctx->registered_rings[offset] = file;
|
||||||
|
return offset;
|
||||||
|
}
|
||||||
|
return -EBUSY;
|
||||||
|
}
|
||||||
|
|
||||||
static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
|
static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
|
||||||
int start, int end)
|
int start, int end)
|
||||||
{
|
{
|
||||||
struct file *file;
|
struct file *file;
|
||||||
int offset;
|
int offset;
|
||||||
|
|
||||||
for (offset = start; offset < end; offset++) {
|
file = fget(fd);
|
||||||
offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
|
if (!file) {
|
||||||
if (tctx->registered_rings[offset])
|
return -EBADF;
|
||||||
continue;
|
} else if (!io_is_uring_fops(file)) {
|
||||||
|
fput(file);
|
||||||
file = fget(fd);
|
return -EOPNOTSUPP;
|
||||||
if (!file) {
|
|
||||||
return -EBADF;
|
|
||||||
} else if (!io_is_uring_fops(file)) {
|
|
||||||
fput(file);
|
|
||||||
return -EOPNOTSUPP;
|
|
||||||
}
|
|
||||||
tctx->registered_rings[offset] = file;
|
|
||||||
return offset;
|
|
||||||
}
|
}
|
||||||
|
offset = io_ring_add_registered_file(tctx, file, start, end);
|
||||||
return -EBUSY;
|
if (offset < 0)
|
||||||
|
fput(file);
|
||||||
|
return offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in New Issue