/*
 * IO submission data structure (Submission Queue Entry)
 */
struct io_uring_sqe {
	__u8	opcode;		/* type of operation for this sqe */
	__u8	flags;		/* IOSQE_ flags */
	__u16	ioprio;		/* ioprio for the request */
	__s32	fd;		/* file descriptor to do IO on */
	union {
		__u64	off;	/* offset into file */
		__u64	addr2;
		struct {
			__u32	cmd_op;
			__u32	__pad1;
		};
	};
	union {
		__u64	addr;	/* pointer to buffer or iovecs */
		__u64	splice_off_in;
		struct {
			__u32	level;
			__u32	optname;
		};
	};
	__u32	len;		/* buffer size or number of iovecs */
	union {
		__u32	rw_flags;
		__u32	fsync_flags;
		__u16	poll_events;	/* compatibility */
		__u32	poll32_events;	/* word-reversed for BE */
		__u32	sync_range_flags;
		__u32	msg_flags;
		__u32	timeout_flags;
		__u32	accept_flags;
		__u32	cancel_flags;
		__u32	open_flags;
		__u32	statx_flags;
		__u32	fadvise_advice;
		__u32	splice_flags;
		__u32	rename_flags;
		__u32	unlink_flags;
		__u32	hardlink_flags;
		__u32	xattr_flags;
		__u32	msg_ring_flags;
		__u32	uring_cmd_flags;
		__u32	waitid_flags;
		__u32	futex_flags;
		__u32	install_fd_flags;
		__u32	nop_flags;
		__u32	pipe_flags;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
	union {
		/* index into fixed buffers, if used */
		__u16	buf_index;
		/* for grouped buffer selection */
		__u16	buf_group;
	} __attribute__((packed));
	/* personality to use, if used */
	__u16	personality;
	union {
		__s32	splice_fd_in;
		__u32	file_index;
		__u32	zcrx_ifq_idx;
		__u32	optlen;
		struct {
			__u16	addr_len;
			__u16	__pad3[1];
		};
		struct {
			__u8	write_stream;
			__u8	__pad4[3];
		};
	};
	union {
		struct {
			__u64	addr3;
			__u64	__pad2[1];
		};
		struct {
			__u64	attr_ptr;	/* pointer to attribute information */
			__u64	attr_type_mask;	/* bit mask of attributes */
		};
		__u64	optval;
		/*
		 * If the ring is initialized with IORING_SETUP_SQE128, then
		 * this field is used for 80 bytes of arbitrary command data
		 */
		__u8	cmd[0];
	};
};
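To make the SQE layout concrete, here is a minimal user-space sketch using liburing (not part of the kernel code above): the prep helper fills opcode, fd, addr, len and off, while user_data is echoed back verbatim in the completion. The file path and buffer size are illustrative assumptions; completion handling is shown after the CQE structure below.

/*
 * Sketch (liburing): fill and submit one SQE for a read.
 * File path and buffer size are arbitrary assumptions.
 */
#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	static char buf[4096];

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	int fd = open("/etc/hostname", O_RDONLY);	/* any readable file */
	if (fd < 0)
		return 1;

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	/* prep helper fills opcode (IORING_OP_READ), fd, addr, len and off */
	io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
	/* user_data is opaque to the kernel and comes back in the CQE */
	sqe->user_data = 42;

	int submitted = io_uring_submit(&ring);	/* hands the SQE to the kernel */
	printf("submitted %d sqe(s)\n", submitted);

	io_uring_queue_exit(&ring);
	return 0;
}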
The io_uring_cqe structure describes a completed request (Completion Queue Entry):
/*
 * IO completion data structure (Completion Queue Entry)
 */
struct io_uring_cqe {
	__u64	user_data;	/* sqe->user_data value passed back */
	__s32	res;		/* result code for this event */
	__u32	flags;

	/*
	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
	 * contains 16-bytes of padding, doubling the size of the CQE.
	 */
	__u64 big_cqe[];
};
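A matching sketch for the completion side, again with liburing and purely illustrative: submit a NOP and reap its CQE, reading back user_data and res.

/*
 * Sketch (liburing): submit a no-op and reap its CQE. cqe->user_data is the
 * value stored in the SQE, cqe->res is the operation's result (0 for NOP).
 */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(4, &ring, 0) < 0)
		return 1;

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->user_data = 0xdead;

	io_uring_submit(&ring);

	/* blocks until at least one completion is available */
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("user_data=%llu res=%d flags=%u\n",
		       (unsigned long long)cqe->user_data, cqe->res, cqe->flags);
		/* mark the CQE as consumed, i.e. advance the CQ head */
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}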
/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	atomic_t		sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32			cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
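As the comment says, the layout of this shared structure is published to user space through struct io_sqring_offsets / io_cqring_offsets when io_uring_setup() returns. A small raw-syscall sketch (illustrative only) shows what the application actually receives and later uses to locate head, tail, ring_mask and the arrays inside the mmap'ed region:

/*
 * Sketch (raw syscall): io_uring_setup() fills struct io_uring_params with
 * the ring sizes and the offsets of the shared io_rings fields.
 */
#include <linux/io_uring.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;
	memset(&p, 0, sizeof(p));

	int ring_fd = (int)syscall(__NR_io_uring_setup, 8, &p);
	if (ring_fd < 0)
		return 1;

	printf("sq_entries=%u cq_entries=%u\n", p.sq_entries, p.cq_entries);
	printf("sq_off: head=%u tail=%u ring_mask=%u array=%u\n",
	       p.sq_off.head, p.sq_off.tail, p.sq_off.ring_mask, p.sq_off.array);
	printf("cq_off: head=%u tail=%u ring_mask=%u cqes=%u\n",
	       p.cq_off.head, p.cq_off.tail, p.cq_off.ring_mask, p.cq_off.cqes);

	close(ring_fd);
	return 0;
}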
	/* submission data */
	struct {
		struct mutex		uring_lock;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		struct io_uring_sqe	*sq_sqes;
		unsigned		cached_sq_head;
		unsigned		sq_entries;

		/*
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		atomic_t		cancel_seq;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		bool			poll_multi_queue;
		struct io_wq_work_list	iopoll_list;

		/*
		 * Modifications are protected by ->uring_lock and ->mmap_lock.
		 * The flags, buf_pages and buf_nr_pages fields should be stable
		 * once published.
		 */
		struct xarray		io_bl_xa;

		/*
		 * Any cancelable uring_cmd is added to this list in
		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
		 */
		struct hlist_head	cancelable_uring_cmd;

		/*
		 * For Hybrid IOPOLL, runtime in hybrid polling, without
		 * scheduling time
		 */
		u64			hybrid_poll_time;
	} ____cacheline_aligned_in_smp;
	...
}
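The sq_array indirection and the IORING_OFF_SQES mapping described above are exactly what a raw user-space submitter drives. The following sketch (raw syscalls, default setup flags assumed, most error handling trimmed) submits a single NOP: it writes an SQE into the SQES mapping, publishes the index through sq_array, bumps the SQ tail with a release store, and then consumes the CQE using the head/tail/mask fields of the shared rings.

/*
 * Sketch (raw syscalls): submit one NOP through the shared rings.
 * Assumes default setup flags (no SQPOLL, sq_array present).
 */
#include <linux/io_uring.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;
	memset(&p, 0, sizeof(p));

	int fd = (int)syscall(__NR_io_uring_setup, 4, &p);
	if (fd < 0)
		return 1;

	size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
	size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);

	char *sq = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	char *cq = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	struct io_uring_sqe *sqes = mmap(NULL,
			p.sq_entries * sizeof(struct io_uring_sqe),
			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			fd, IORING_OFF_SQES);
	if (sq == MAP_FAILED || cq == MAP_FAILED || sqes == MAP_FAILED)
		return 1;

	unsigned *sq_tail  = (unsigned *)(sq + p.sq_off.tail);
	unsigned *sq_mask  = (unsigned *)(sq + p.sq_off.ring_mask);
	unsigned *sq_array = (unsigned *)(sq + p.sq_off.array);
	unsigned *cq_head  = (unsigned *)(cq + p.cq_off.head);
	unsigned *cq_tail  = (unsigned *)(cq + p.cq_off.tail);
	unsigned *cq_mask  = (unsigned *)(cq + p.cq_off.ring_mask);
	struct io_uring_cqe *cqes = (struct io_uring_cqe *)(cq + p.cq_off.cqes);

	/* fill one SQE, publish its index through sq_array, then bump the tail */
	unsigned tail = *sq_tail, idx = tail & *sq_mask;
	memset(&sqes[idx], 0, sizeof(sqes[idx]));
	sqes[idx].opcode = IORING_OP_NOP;
	sqes[idx].user_data = 42;
	sq_array[idx] = idx;
	__atomic_store_n(sq_tail, tail + 1, __ATOMIC_RELEASE);

	/* submit 1 SQE and wait for 1 completion */
	if (syscall(__NR_io_uring_enter, fd, 1, 1,
		    IORING_ENTER_GETEVENTS, NULL, 0) < 0)
		return 1;

	/* the kernel owns the CQ tail; consume one CQE and advance the head */
	unsigned head = *cq_head;
	if (head != __atomic_load_n(cq_tail, __ATOMIC_ACQUIRE)) {
		struct io_uring_cqe *cqe = &cqes[head & *cq_mask];
		printf("user_data=%llu res=%d\n",
		       (unsigned long long)cqe->user_data, cqe->res);
		__atomic_store_n(cq_head, head + 1, __ATOMIC_RELEASE);
	}
	close(fd);
	return 0;
}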
	if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
		ctx->lockless_cq = true;

	/*
	 * lazy poll_wq activation relies on ->task_complete for synchronisation
	 * purposes, see io_activate_pollwq()
	 */
	if (!ctx->task_complete)
		ctx->poll_activated = true;

	/*
	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
	 * space applications don't need to do io completion events
	 * polling again, they can rely on io_sq_thread to do polling
	 * work, which can reduce cpu usage and uring_lock contention.
	 */
	if (ctx->flags & IORING_SETUP_IOPOLL &&
	    !(ctx->flags & IORING_SETUP_SQPOLL))
		ctx->syscall_iopoll = 1;

	ctx->compat = in_compat_syscall();
	if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
		ctx->user = get_uid(current_user());

	/*
	 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
	 * COOP_TASKRUN is set, then IPIs are never needed by the app.
	 */
	if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN))
		ctx->notify_method = TWA_SIGNAL_NO_IPI;
	else
		ctx->notify_method = TWA_SIGNAL;

	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
	mmgrab(current->mm);
	ctx->mm_account = current->mm;
	ret = io_allocate_scq_urings(ctx, p);	// allocate struct io_rings and its pages; most of the memory allocation happens here
	if (ret)
		goto err;
	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
	    && !(ctx->flags & IORING_SETUP_R_DISABLED))
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
	file = io_uring_get_file(ctx);	// obtain the file for the ring fd
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err;
	}

	ret = __io_uring_add_tctx_node(ctx);	// allocate the per-task io_uring context
	if (ret)
		goto err_fput;
	tctx = current->io_uring;
	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
		ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
	else
		ret = io_uring_install_fd(file);
	if (ret < 0)
		goto err_fput;
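For comparison with the IORING_SETUP_REGISTERED_FD_ONLY branch above: liburing also lets an application register an already-installed ring fd after setup, so that later io_uring_enter() calls can take the IORING_ENTER_REGISTERED_RING path we will see below. A rough sketch, assuming liburing 2.2 or newer:

/*
 * Sketch (liburing): register the ring fd after setup. Subsequent submits
 * from this ring can then use the registered-ring path in io_uring_enter(),
 * avoiding fget()/fput() on every syscall.
 */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	/* returns the number of ring fds registered (1) on success */
	int ret = io_uring_register_ring_fd(&ring);
	if (ret < 0) {
		fprintf(stderr, "register_ring_fd: %d\n", ret);
		io_uring_queue_exit(&ring);
		return 1;
	}

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_submit(&ring);	/* liburing now enters via the registered index */

	io_uring_queue_exit(&ring);
	return 0;
}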
	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	...
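From user space these opcodes are reached through the io_uring_register(2) syscall. The following sketch (raw syscall; the file path is an arbitrary assumption) lands in the IORING_REGISTER_FILES branch of the switch above:

/*
 * Sketch (raw syscall): register a small fixed-file table with
 * IORING_REGISTER_FILES, which ends up in io_sqe_files_register().
 */
#include <fcntl.h>
#include <linux/io_uring.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;
	memset(&p, 0, sizeof(p));

	int ring_fd = (int)syscall(__NR_io_uring_setup, 8, &p);
	if (ring_fd < 0)
		return 1;

	int files[2];
	files[0] = open("/etc/hostname", O_RDONLY);	/* any readable file */
	if (files[0] < 0)
		return 1;
	files[1] = dup(files[0]);

	long ret = syscall(__NR_io_uring_register, ring_fd,
			   IORING_REGISTER_FILES, files, 2);
	printf("IORING_REGISTER_FILES -> %ld\n", ret);

	/* fixed files are then referenced by index from SQEs with IOSQE_FIXED_FILE */
	close(ring_fd);
	return 0;
}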
	/*
	 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
	 * need only dereference our task private array to find it.
	 */
	if (flags & IORING_ENTER_REGISTERED_RING) {
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (unlikely(!io_is_uring_fops(file)))
			goto out;
	}

	ctx = file->private_data;
	ret = -EBADFD;
	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
		goto out;

	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
	ret = 0;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (unlikely(ctx->sq_data->thread == NULL)) {
			ret = -EOWNERDEAD;
			goto out;
		}
		if (flags & IORING_ENTER_SQ_WAKEUP)
			wake_up(&ctx->sq_data->wait);
		if (flags & IORING_ENTER_SQ_WAIT)
			io_sqpoll_wait_sq(ctx);

		ret = to_submit;
	} else if (to_submit) {
		ret = io_uring_add_tctx_node(ctx);
		if (unlikely(ret))
			goto out;

		mutex_lock(&ctx->uring_lock);
		ret = io_submit_sqes(ctx, to_submit);
		if (ret != to_submit) {
			mutex_unlock(&ctx->uring_lock);
			goto out;
		}
		if (flags & IORING_ENTER_GETEVENTS) {
			if (ctx->syscall_iopoll)
				goto iopoll_locked;
			/*
			 * Ignore errors, we'll soon call io_cqring_wait() and
			 * it should handle ownership problems if any.
			 */
			if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
				(void)io_run_local_work_locked(ctx, min_complete);
		}
		mutex_unlock(&ctx->uring_lock);
	}
	if (flags & IORING_ENTER_GETEVENTS) {	// wait for completion events
		int ret2;
		if (ctx->syscall_iopoll) {
			/*
			 * We disallow the app entering submit/complete with
			 * polling, but we still need to lock the ring to
			 * prevent racing with polled issue that got punted to
			 * a workqueue.
			 */
			mutex_lock(&ctx->uring_lock);
iopoll_locked:
			ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
			if (likely(!ret2))
				ret2 = io_iopoll_check(ctx, min_complete);
			mutex_unlock(&ctx->uring_lock);
		} else {
			struct ext_arg ext_arg = { .argsz = argsz };
			...

			/*
			 * EBADR indicates that one or more CQE were dropped.
			 * Once the user has been informed we can clear the bit
			 * as they are obviously ok with those drops.
			 */
			if (unlikely(ret2 == -EBADR))
				clear_bit(IO_CHECK_CQ_DROPPED_BIT,
					  &ctx->check_cq);
		}
	}
out:
	if (!(flags & IORING_ENTER_REGISTERED_RING))
		fput(file);
	return ret;
}
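On the application side, the to_submit/min_complete/IORING_ENTER_GETEVENTS semantics handled above are what liburing's submit-and-wait helper maps onto: one io_uring_enter() call that both submits pending SQEs and waits for completions. A sketch:

/*
 * Sketch (liburing): io_uring_submit_and_wait() corresponds to a single
 * io_uring_enter(fd, to_submit, wait_nr, IORING_ENTER_GETEVENTS, ...).
 */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	unsigned head, seen = 0;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	for (int i = 0; i < 4; i++) {
		struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
		io_uring_prep_nop(sqe);
		sqe->user_data = i;
	}

	/* submit the 4 queued SQEs and wait until at least 4 CQEs are ready */
	io_uring_submit_and_wait(&ring, 4);

	io_uring_for_each_cqe(&ring, head, cqe) {
		printf("completed user_data=%llu res=%d\n",
		       (unsigned long long)cqe->user_data, cqe->res);
		seen++;
	}
	io_uring_cq_advance(&ring, seen);	/* batch-advance the CQ head */

	io_uring_queue_exit(&ring);
	return 0;
}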
	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}
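The buffers pinned by this loop are later selected by buf_index in the SQE (see the struct at the top) with IORING_OP_READ_FIXED / IORING_OP_WRITE_FIXED. A liburing sketch tying the two ends together; the file path is an arbitrary assumption:

/*
 * Sketch (liburing): register one fixed buffer and read into it with
 * READ_FIXED. The last argument of the prep helper is the buf_index into
 * the registered buffer table built by io_sqe_buffers_register().
 */
#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>
#include <sys/uio.h>

int main(void)
{
	struct io_uring ring;
	static char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;
	/* pins the pages once; this call lands in io_sqe_buffers_register() */
	if (io_uring_register_buffers(&ring, &iov, 1) < 0)
		return 1;

	int fd = open("/etc/hostname", O_RDONLY);
	if (fd < 0)
		return 1;

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_fixed(sqe, fd, buf, sizeof(buf), 0, 0);
	io_uring_submit(&ring);

	struct io_uring_cqe *cqe;
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("read %d bytes into fixed buffer 0\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}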