Chapter 1. Introduction

1.1. What Is A Kernel Module?

Linux系统内核:宏内核(与微内核相对),使用内核模块实现动态模块的加载运行。

简介 优点 缺点
宏内核 将进程调度核心功能,驱动程序,网络协议、文件系统都放入内核态 效率高 一个出错就崩溃
微内核 只将核心功能放在内核态,其他放在用户态以进程形式运行 驱动程序出错不影响内核运行 效率低

LKM:用于扩展内核的功能,运行在内核态,为ELF二进制文件,如果不使用内核模块,会导致需要给内核添加功能时只能重新编译整个内核,不是很方便。

1.2. How Do Modules Get Into The Kernel?

使用lsmod命令查看已经加载到内核的模块(/proc/modules)。

使用modprobe命令加载指定模块到内核(会自动查询模块的依赖关系并一并加载,模块文件位于/lib/modules/version/kernel/*/*.ko)。

使用insmod命令加载模块到内核,不会查询依赖关系(modprobe调用该命令)

1
2
3
4
5
insmod /lib/modules/2.6.11/kernel/fs/fat/fat.ko
insmod /lib/modules/2.6.11/kernel/fs/msdos/msdos.ko


modprobe msdos

华为路由器LKM列表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
root@debian-mips:~# lsmod
Module Size Used by
msdos 7642 0
fat 54395 1 msdos
ipv6 312786 12
loop 13153 0
mtdchar 7769 0
cfi_cmdset_0001 26278 1
cfi_probe 3224 0
gen_probe 2353 1 cfi_probe
sg 30552 0
uhci_hcd 26706 0
cfi_util 4778 2 cfi_cmdset_0001,cfi_probe
ehci_hcd 49012 0
physmap 2637 0
sr_mod 16338 0
i2c_piix4 5936 0
mtd 18073 6 mtdchar,cfi_cmdset_0001,physmap
8139too 20824 0
psmouse 53122 0
usbcore 152780 3 uhci_hcd,ehci_hcd
chipreg 1474 2 cfi_probe,physmap
cdrom 38519 1 sr_mod
i2c_core 18515 1 i2c_piix4
8139cp 20460 0
serio_raw 4416 0
map_funcs 1034 1 physmap
evdev 8808 0
nls_base 6015 2 fat,usbcore

Chapter2.Hello World

2.1. Hello, World (part 1): The Simplest Module

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*  
* hello-1.c - The simplest kernel module.
*/
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */

int init_module(void)
{
	/* Leave a greeting in the kernel log at "info" priority. */
	printk(KERN_INFO "Hello world 1.\n");

	/*
	 * Zero reports success. Any non-zero value means initialisation
	 * failed and the module will not be loaded.
	 */
	return 0;
}

void cleanup_module(void)
{
	/* Log a farewell message when the module is removed. */
	printk(KERN_INFO "Goodbye world 1.\n");
}

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rita Roseweisse");

必须至少有两个函数:一个名为init_module()的“开始”(初始化)函数,它在模块被insmod时被调用;一个名为cleanup_module()的“结束”(清理)函数,它在模块被rmmod之前被调用。

在内核版本2.3之后,入口函数和卸载函数可以不必这样命名:使用module_init指定加载时的入口函数,使用module_exit指定卸载时的清理函数。

【报错】:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
❯ sudo insmod ./hello-1.ko                                     
insmod: ERROR: could not insert module ./hello-1.ko: Invalid module format
~/Documents/LKM/writeup/code Py base 09:24:36
❯ sudo dmesg | grep modules
[ 715.059773] module: x86/modules: Skipping invalid relocation target, existing value is nonzero for type 1, loc 000000000fc2cc28, val ffffffffc0cb4040
[ 1165.465487] module: x86/modules: Skipping invalid relocation target, existing value is nonzero for type 1, loc 000000000fc2cc28, val ffffffffc0cb4040

❯ sudo insmod ./hello-1.ko
insmod: ERROR: could not insert module ./hello-1.ko: Invalid module format
~/Documents/LKM/writeup/code Py base 09:24:36
❯ sudo dmesg | grep modules
[ 715.059773] module: x86/modules: Skipping invalid relocation target, existing value is nonzero for type 1, loc 000000000fc2cc28, val ffffffffc0cb4040

# solution 注:如果重装的不是uname -r显示的版本,需要指定
sudo apt update && sudo apt upgrade
sudo apt remove --purge linux-headers-*
sudo apt autoremove && sudo apt autoclean
sudo apt install linux-headers-generic

2.1.1 PrintK()

注:打印函数与Printf函数不同(应用程序可以调用C标准库,但内核函数一般调用自己提供的函数)

printk函数可以用来进行内核调试,其可以打印信息到终端或日志中,打印信息分为几个等级。

【附:除Printk外的一些打印内核信息的函数】

2.2 编译内核模块

使用Makefile进行编译

Makefile:指定内核源码,编译参数,编译平台

1
2
3
4
5
6
7
# Build hello-1.c as an out-of-tree module against the running kernel.
obj-m += hello-1.o

.PHONY: all clean

# Recipe lines MUST start with a TAB character.
# $(MAKE) (instead of a literal "make") propagates flags such as -j to kbuild.
all:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

clean:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean

编译生成*.ko文件,即为内核模块。

使用modinfo hello-1.ko查看模块信息。

加载模块到内核insmod ./hello-1.ko

模块的加载流程

  • 驱动程序注册设备,创建系统信息(/sys/class/xxx)
  • 注册设备使用的是模块文件名,要带.ko后缀
  • 当操作已插入内核的模块时,只需使用模块名
  • udev|mdev根据注册的设备信息,创建设备节点(/dev/xxx)
  • 所有设备节点信息存储在/proc/devices
  • 加载后会输出模块内的加载信息,通过dmesg查看

init_module系统调用流程

  • 不依赖C库。链接/重定位自己完成
  • Kernel/module.c/init_module
  • 拷贝到内核:copy_module_from_user
  • 地址空间分配:layout_and_allocate
  • 符号解析:simplify_symbols
  • 重定位:apply_relocations
  • 执行:complete_formation

2.3 Hello World (part 2)

可以使用module_init(hello_2_init); module_exit(hello_2_exit);函数指定入口函数和退出函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

/*
* hello-2.c - Demonstrating the module_init() and module_exit() macros.
* This is preferred over using init_module() and cleanup_module().
*/
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
#include <linux/init.h> /* Needed for the macros */

/* Module entry point, registered through module_init(). */
static int __init hello_2_init(void)
{
	printk(KERN_INFO "Hello, world 2\n");

	/* 0 == loaded successfully. */
	return 0;
}

/* Module exit point, registered through module_exit(). */
static void __exit hello_2_exit(void)
{
	printk(KERN_INFO "Goodbye, world 2\n");
}

module_init(hello_2_init);
module_exit(hello_2_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rita Roseweisse");

2.4 Hello World (part 3): __init和__exit宏定义

__init宏导致一旦内置驱动程序的init函数完成,init函数就会被丢弃并释放其内存(对Loadable modules无效)

__exit宏使得当模块被编译进内核(built-in)时,该函数会被直接省略(对Loadable Modules无效);内置驱动程序不需要cleanup函数,但Loadable Modules需要。

以下代码演示了__init、__initdata、__exit宏的用法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*  
* hello-3.c - Illustrating the __init, __initdata and __exit macros.
*/
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
#include <linux/init.h> /* Needed for the macros */

static int hello3_data __initdata = 3;

/*
 * Entry point. Prints the value held in hello3_data, a variable that is
 * itself marked __initdata.
 */
static int __init hello_3_init(void)
{
	printk(KERN_INFO "Hello, world %d\n", hello3_data);

	return 0;
}

/* Exit point: logs a farewell message. */
static void __exit hello_3_exit(void)
{
	printk(KERN_INFO "Goodbye, world 3\n");
}

module_init(hello_3_init);
module_exit(hello_3_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rita Roseweisse");

修改Makefile,编译加载内核,查看记录信息:

1
[ 2774.912662] Hello, world 3

2.5 Hello World (part 4): Licensing and Module Documentation

  • Licensing:使用MODULE_LICENSE()宏定义。GPL代表模块开源免费
  • Description:使用MODULE_DESCRIPTION宏定义。描述模块用于做什么。
  • Author:使用MODULE_AUTHOR宏定义。描述作者
  • 支持设备:MODULE_SUPPORTED_DEVICE()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/*  
* hello-4.c - Demonstrates module documentation.
*/
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
#include <linux/init.h> /* Needed for the macros */
#define DRIVER_AUTHOR "Peter Jay Salzman <p@dirac.org>"
#define DRIVER_DESC "A sample driver"

/* Entry point for hello-4: logs a greeting and reports success. */
static int __init init_hello_4(void)
{
	printk(KERN_INFO "Hello, world 4\n");

	return 0;
}

/* Exit point for hello-4: logs a farewell message. */
static void __exit cleanup_hello_4(void)
{
	printk(KERN_INFO "Goodbye, world 4\n");
}

module_init(init_hello_4);
module_exit(cleanup_hello_4);

/*
* You can use strings, like this:
*/

/*
* Get rid of taint message by declaring code as GPL.
*/
MODULE_LICENSE("GPL");

/*
* Or with defines, like this:
*/
MODULE_AUTHOR(DRIVER_AUTHOR); /* Who wrote this module? */
MODULE_DESCRIPTION(DRIVER_DESC); /* What does this module do */

/*
 * This module uses /dev/testdevice. MODULE_SUPPORTED_DEVICE was only ever
 * used for documentation purposes and has been removed from newer kernels,
 * so emit it only where the macro still exists; otherwise the module would
 * fail to compile.
 */
#ifdef MODULE_SUPPORTED_DEVICE
MODULE_SUPPORTED_DEVICE("testdevice");
#endif

2.6 向内核模块传递命令行参数

需要将要传入参数的变量声明为全局变量,并且使用module_param()宏进行声明。加载模块时,insmod会将命令行参数传入,例如:insmod mymodule.ko myvariable=5

module_param()宏接受3个参数:变量名、变量类型,以及该参数在sysfs中对应文件的权限位(为0则不在sysfs中导出)

1
2
/* Default value; used when no "myint=..." is given on the insmod command line. */
int myint = 3;
/* Arguments: variable name, parameter type, permission bits. */
module_param(myint, int, 0);

数组变量格式稍有不同:

1
2
3
4
5
6
int myintarray[2];
module_param_array(myintarray, int, NULL, 0); /* not interested in count */

/* The element type of the array must match the type given to the macro. */
short myshortarray[4];
int count;
module_param_array(myshortarray, short, &count, 0); /* put count into "count" variable */

MODULE_PARM_DESC()记录模块接受的参数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/*
* hello-5.c - Demonstrates command line argument passing to a module.
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/stat.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Peter Jay Salzman");

static short int myshort = 1;
static int myint = 420;
static long int mylong = 9999;
static char *mystring = "blah";
static int myintArray[2] = { -1, -1 };
static int arr_argc = 0;

/*
* module_param(foo, int, 0000)
* The first param is the parameters name
* The second param is it's data type
* The final argument is the permissions bits,
* for exposing parameters in sysfs (if non-zero) at a later stage.
*/

/* myshort: read/write for owner and group (octal 0660). */
module_param(myshort, short, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
MODULE_PARM_DESC(myshort, "A short integer");
/* myint: read/write for owner, read-only for group and others (octal 0644). */
module_param(myint, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
MODULE_PARM_DESC(myint, "An integer");
/* mylong: read-only for owner (octal 0400). */
module_param(mylong, long, S_IRUSR);
MODULE_PARM_DESC(mylong, "A long integer");
/* mystring: "charp" is the parameter type for char pointers; permission 0. */
module_param(mystring, charp, 0000);
MODULE_PARM_DESC(mystring, "A character string");

/*
* module_param_array(name, type, num, perm);
* The first param is the parameter's (in this case the array's) name
* The second param is the data type of the elements of the array
* The third argument is a pointer to the variable that will store the number
* of elements of the array initialized by the user at module loading time
* The fourth argument is the permission bits
*/
module_param_array(myintArray, int, &arr_argc, 0000);
MODULE_PARM_DESC(myintArray, "An array of integers");

/*
 * Entry point: dumps the current value of every module parameter to the
 * kernel log, including each slot of myintArray and the number of array
 * elements supplied at load time (arr_argc).
 */
static int __init hello_5_init(void)
{
	int idx;

	printk(KERN_INFO "Hello, world 5\n=============\n");
	printk(KERN_INFO "myshort is a short integer: %hd\n", myshort);
	printk(KERN_INFO "myint is an integer: %d\n", myint);
	printk(KERN_INFO "mylong is a long integer: %ld\n", mylong);
	printk(KERN_INFO "mystring is a string: %s\n", mystring);

	/* Walk every slot of the array, whether or not the user set it. */
	for (idx = 0; idx < (sizeof(myintArray) / sizeof(myintArray[0])); idx++)
		printk(KERN_INFO "myintArray[%d] = %d\n", idx, myintArray[idx]);

	printk(KERN_INFO "got %d arguments for myintArray.\n", arr_argc);

	return 0;
}

/* Exit point: logs a farewell message. */
static void __exit hello_5_exit(void)
{
	printk(KERN_INFO "Goodbye, world 5\n");
}

module_init(hello_5_init);
module_exit(hello_5_exit);

编译生成的模块hello_5基本信息:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
modinfo hello-5.ko
filename: /home/bronya/Documents/LKM/writeup/code/hello-5.ko
author: Peter Jay Salzman
license: GPL
srcversion: A704327C32F7F311666C13C
depends:
retpoline: Y
name: hello_5
vermagic: 6.2.0-37-generic SMP preempt mod_unload modversions
parm: myshort:A short integer (short)
parm: myint:An integer (int)
parm: mylong:A long integer (long)
parm: mystring:A character string (charp)
parm: myintArray:An array of integers (array of int)

加载模块显示信息:

1
2
3
4
5
6
7
8
9
10
11
[ 4659.140006] hello_5: unknown parameter 'mybyte' ignored
[ 4659.140047] Hello, world 5
=============
[ 4659.140048] myshort is a short integer: 1
[ 4659.140049] myint is an integer: 420
[ 4659.140049] mylong is a long integer: 9999
[ 4659.140050] mystring is a string: bebop
[ 4659.140050] myintArray[0] = -1
[ 4659.140051] myintArray[1] = -1
[ 4659.140051] got 1 arguments for myintArray.

2.7 多文件编译模块

可以将内核模块分为多个源文件

例:一个模块分为start,stop,将入口函数和退出函数分离

1
2
3
4
5
6
7
8
9
10
11
12
/*
* start.c - Illustration of multi filed modules
*/

#include <linux/kernel.h> /* We're doing kernel work */
#include <linux/module.h> /* Specifically, a module */

/* Entry point of the multi-file module; its exit function lives in stop.c. */
int init_module(void)
{
	printk(KERN_INFO "Hello, world - this is the kernel speaking\n");

	return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
* stop.c - Illustration of multi filed modules
*/

#include <linux/kernel.h> /* We're doing kernel work */
#include <linux/module.h> /* Specifically, a module */

/*
 * Exit point of the multi-file module; its entry function lives in start.c.
 * Declared with an explicit (void) parameter list: in C an empty "()" is
 * not a prototype.
 */
void cleanup_module(void)
{
	printk(KERN_INFO "Short is the life of a kernel module\n");
}

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rita Roseweisse");

Makefile

1
2
3
4
5
6
7
8
9
10
11
12
# One loadable module per source file ...
obj-m += hello-1.o
obj-m += hello-2.o
obj-m += hello-3.o
obj-m += hello-4.o
obj-m += hello-5.o
# ... plus one module (startstop.ko) linked from two objects.
obj-m += startstop.o
startstop-objs := start.o stop.o

.PHONY: all clean

# Recipe lines MUST start with a TAB character.
# $(MAKE) (instead of a literal "make") propagates flags such as -j to kbuild.
all:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(shell pwd) modules
clean:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(shell pwd) clean

2.8 重新编译内核

有利于版本的匹配以及内核模块的动态装载以及强制卸载。此处不再详细描述。想了解的读者可以参考原文(https://tldp.org/LDP/lkmpg/2.6/html/lkmpg.html#AEN380)

Chapter 3 初步知识

3.1 模块与程序

3.1.1 模块如何开始与结束

  • 程序通常使用main()函数作为入口函数,在执行完所有指令后退出。
  • 内核模块使用module_init指定入口函数。初始化操作向内核提供了模块的函数,当内核需要调用模块函数时才会真正执行模块中代码;模块结束使用module_exit,是init操作的反向操作。

3.1.2 模块可以调用的函数

程序可以调用标准C语言库,例如printf()

内核模块只能调用内核实现的函数,因为模块的目标文件的符号由insmod解析。符号的定义来自于内核本身,导致能够调用的函数只能是由内核提供的,即系统调用(write,ioctl,read)。内核导出的符号可以参见/proc/kallsyms

库函数运行在用户空间,向开发者提供了系统调用更方便的接口(库函数仍然需要调用系统调用实现)。系统调用运行在内核态。

1
2
3
#include <stdio.h>
/* Minimal user-space program: prints "hello" through the C library. */
int main(void)
{
	printf("hello");

	return 0;
}

使用gcc -Wall -o test test.c编译,执行strace ./test可以看到输出最后有write(1, "hello", 5hello),即printf最终使用的系统调用。

man 2 write:查看write函数的使用说明,2代表系统调用(kill(), read());3代表库函数

可以使用模块来替代内核的系统调用,基于此可以实现插入后门或木马等操作。

3.1.3 用户空间与内核空间

内核态与用户态的转换

3.1.4 命名空间

开发者自己开发的变量名不能与其他开发者的冲突,因此会有命名空间来避免(C++有std::,C里这里只提到了要多注意,应该是没有)

当写内核模块时,由于内核模块会被加载到整个内核,因此更需要进行命名管理。最好的方式是将所有变量声明为static且使用合适的前缀命名变量。习惯上,Linux内核变量名一般为小写形式。(还可以定义一个符号表,后面会提到)

3.1.5 代码空间

内存管理是十分复杂的问题。这里只探究写内核模块需要考虑的问题

每开启一个新进程,内核会为其分配一个真实的物理内存,内存在进程看来为虚拟内存,从0x00000000开始,不同进程的内存地址(0xbffff978)对应的实际地址并不相同。每个进程的虚拟内存与实际内存之间存在着特定偏移。且不同进程无法访问其他进程的内存地址。

内核也有自己的代码空间。由于内核模块是加载到内核中使用,其会与内核共享代码空间。因此如果内核模块内存错误,内核也会出现内存错误。

以上为针对宏内核操作系统,微内核操作系统每个模块有独立的代码空间(GNU Hurd, QNX Neutrino)。

3.1.6 Device Drivers

一类模块是设备驱动程序,为硬件提供功能。在unix系统中,每一个硬件都由/dev中的文件表示。设备驱动程序可以代表用户程序与硬件进行通信。例如es1370.o声卡驱动程序可以将/dev/sound连接到Ensoniq IS1370声卡。用户空间的程序可以直接使用/dev/sound不需要考虑声卡类型。

3.1.6.1 Major and Minor Numbers
1
2
3
4
❯ ls -l /dev/sda[1-3]
brw-rw---- 1 root disk 8, 1 1月 5 09:11 /dev/sda1
brw-rw---- 1 root disk 8, 2 1月 5 09:11 /dev/sda2
brw-rw---- 1 root disk 8, 3 1月 5 09:11 /dev/sda3

上图表示了硬盘的前三个分区,其中由逗号分隔的数字前面的为Major Number,后面为Minor Number。Major Number表示哪一个设备驱动访问该硬件,每一个设备驱动都有特定的major number。

Minor Number用来区分同一驱动控制的不同设备。

设备分为两类:字符设备和块设备。块设备有一个缓冲区存放请求(可以对请求进行排序)存储设备;字符设备则没有缓冲区。可以通过查看ls -l中第一个字符为’b’还是’c’来分辨。

可以通过查阅/usr/src/linux/Documentation/devices.txt来了解major number对应的设备

Chapter 4 字符设备文件

4.1 字符设备驱动程序

4.1.1 file_operations 结构体

提供了设备驱动程序对设备的多种操作函数的地址,操作函数定义在内核模块中。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

/*
 * Table of function pointers through which the kernel reaches a driver's
 * implementation of each file operation. A driver fills in the operations
 * it supports.
 * NOTE(review): this member list (aio_read, readdir, ioctl, sendfile, ...)
 * looks like an older-kernel layout — confirm against include/linux/fs.h
 * of the kernel you are building for.
 */
struct file_operations {
struct module *owner;
loff_t(*llseek) (struct file *, loff_t, int);
/* read/write: transfer data to/from a user-space buffer (note __user). */
ssize_t(*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t(*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
ssize_t(*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t(*aio_write) (struct kiocb *, const char __user *, size_t,
loff_t);
int (*readdir) (struct file *, void *, filldir_t);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int,
unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
/* open/release: both receive the inode and the struct file involved. */
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, struct dentry *, int datasync);
int (*aio_fsync) (struct kiocb *, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
ssize_t(*readv) (struct file *, const struct iovec *, unsigned long,
loff_t *);
ssize_t(*writev) (struct file *, const struct iovec *, unsigned long,
loff_t *);
ssize_t(*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
void __user *);
ssize_t(*sendpage) (struct file *, struct page *, int, size_t,
loff_t *, int);
unsigned long (*get_unmapped_area) (struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long);
};

改进

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// GCC-specific "field: value" initializer syntax.
// Only the four listed members are set explicitly.
struct file_operations fops = {
read: device_read,
write: device_write,
open: device_open,
release: device_release
};

// Standard C99 designated initializers (".field = value").
// Only the four listed members are set explicitly.
struct file_operations fops = {
.read = device_read,
.write = device_write,
.open = device_open,
.release = device_release
};

4.1.2 file结构体

每个设备在内核中由file结构体表示。该结构是内核层面的结构体,不会在用户空间出现,与glibc定义的FILE结构体不同。另外,其代表的是抽象的、已打开的file,而不是硬盘上的file文件(后者使用inode结构体表示)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/*
* f_{lock,count,pos_lock} members can be highly contended and share
* the same cacheline. f_{lock,mode} are very frequently used together
* and so share the same cacheline as well. The read-mostly
* f_{path,inode,op} are kept on a separate cacheline.
*/
/*
 * Kernel-side object describing one open file. It is distinct from the
 * inode, which f_inode below points to.
 */
struct file {
union {
struct llist_node f_llist;
struct rcu_head f_rcuhead;
unsigned int f_iocb_flags;
};

/*
* Protects f_ep, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
fmode_t f_mode;
atomic_long_t f_count;
struct mutex f_pos_lock;
loff_t f_pos;
unsigned int f_flags;
struct fown_struct f_owner;
const struct cred *f_cred;
struct file_ra_state f_ra;
struct path f_path;
struct inode *f_inode; /* cached value */
/* Operation table used for this open file. */
const struct file_operations *f_op;

u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/* needed for tty driver, and maybe others */
void *private_data;

#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct hlist_head *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
errseq_t f_wb_err;
errseq_t f_sb_err; /* for syncfs */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */

4.1.3 Registering A Device

添加一个驱动需要向内核进行注册。该操作会向其分配一个major number(没有minor number因为其只与驱动有关,与内核无关)。使用register_chrdev函数 (linux/fs.h) 进行注册

1
2
/* Registers a character device driver under the given name and operation table. */
int register_chrdev(unsigned int major, const char *name, struct file_operations *fops);
// A negative return value means registration failed.

unsigned int major: 请求的major number

char *name: 设备驱动的名字,会保存在/proc/devices中。

struct file_operations *fops: 是file_operations表的指针

注:为了保证请求的major number不与已有的设备驱动冲突,可以 1)查看/Documentation/devices.txt;2)传入major number 0.会返回动态分配的major number。缺点是不能提前新建驱动文件。可以手动创建或构造脚本,或者使用mknod命令自动创建

注:存在一种自动创建设备节点的方式,即使用class_create()。Linux内核提供了一组函数,可以用来在模块加载的时候自动在/dev目录下创建相应的设备节点,并在卸载模块时删除该节点,当然前提条件是用户空间移植了udev(一个用户空间程序)。内核中定义了struct class结构体,顾名思义,一个struct class结构体类型变量对应一个类,内核同时提供了class_create()函数,可以用它来创建一个类,这个类存放在sysfs下,一旦创建好后,再调用device_create()函数来在/dev目录下创建相应的设备节点。这样,加载模块的时候,用户空间中的udev会自动响应device_create()函数,去/sys下寻找对应的类从而创建设备节点。

4.1.4 Unregistering A Device

我们不能允许内核模块在root用户想卸载时就随时被卸载。如果设备文件正被某个进程打开,此时将对应的内核模块卸载,之后对该文件的使用会访问原内核模块对应的内存地址;但由于该地址上的内容已发生改变,会导致意想不到的结果。

一般情况下,可以让函数返回负值来禁用某功能,不过cleanup_module无法这样做,因为它是void函数。不过,内核中存在一个计数器来记录有多少进程正在占用模块,可以通过/proc/modules的第三个字段查看。若不为零,则无法卸载(该检查内核已实现)。

可以使用以下函数来改变计数器值:

try_module_get(THIS_MODULE): 增加计数

module_put(THIS_MODULE): 减少计数

4.1.5 chardev.c

以下代码创建了一个字符类设备驱动chardev。该设备文件可以记录设备文件被读取的次数。无法写入该设备文件,不过会记录此行为并告知用户该行为不受支持。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
/*
* chardev.c: Creates a read-only char device that says how many times
* you've read from the dev file
*/

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <asm/uaccess.h> /* for put_user */

/*
* Prototypes - this would normally go in a .h file
*/
int init_module(void);
void cleanup_module(void);
static int device_open(struct inode *, struct file *);
static int device_release(struct inode *, struct file *);
static ssize_t device_read(struct file *, char *, size_t, loff_t *);
static ssize_t device_write(struct file *, const char *, size_t, loff_t *);

#define SUCCESS 0
#define DEVICE_NAME "chardev" /* Dev name as it appears in /proc/devices */
#define BUF_LEN 80 /* Max length of the message from the device */

/*
* Global variables are declared as static, so are global within the file.
*/

static int Major; /* Major number assigned to our device driver */
static int Device_Open = 0; /* Is device open?
* Used to prevent multiple access to device */
static char msg[BUF_LEN]; /* The msg the device will give when asked */
static char *msg_Ptr;

/* Operations this driver implements; every other member stays unset. */
static struct file_operations fops = {
	.open    = device_open,
	.release = device_release,
	.read    = device_read,
	.write   = device_write,
};

/*
 * Module load hook.
 *
 * Registers the character device with major number 0 (so the kernel picks
 * one), stores the result in Major, and logs instructions for creating the
 * matching device node. Returns SUCCESS, or the negative value from
 * register_chrdev() on failure.
 */
int init_module(void)
{
	int assigned = register_chrdev(0, DEVICE_NAME, &fops);

	Major = assigned;
	if (assigned < 0) {
		printk(KERN_ALERT "Registering char device failed with %d\n", assigned);
		return assigned;
	}

	printk(KERN_INFO "I was assigned major number %d. To talk to\n", assigned);
	printk(KERN_INFO "the driver, create a dev file with\n");
	printk(KERN_INFO "'mknod /dev/%s c %d 0'.\n", DEVICE_NAME, assigned);
	printk(KERN_INFO "Try various minor numbers. Try to cat and echo to\n");
	printk(KERN_INFO "the device file.\n");
	printk(KERN_INFO "Remove the device file and module when done.\n");

	return SUCCESS;
}

/*
 * Module unload hook: releases the major number obtained in init_module().
 *
 * unregister_chrdev() returns void on current kernels, so there is no
 * status to capture or report; assigning its result no longer compiles.
 */
void cleanup_module(void)
{
	unregister_chrdev(Major, DEVICE_NAME);
}

/*
* Methods
*/

/*
* Called when a process tries to open the device file, like
* "cat /dev/mycharfile"
*/
static int device_open(struct inode *inode, struct file *file)
{
static int counter = 0;

if (Device_Open)
return -EBUSY;

Device_Open++;
sprintf(msg, "I already told you %d times Hello world!\n", counter++);
msg_Ptr = msg;
try_module_get(THIS_MODULE);

return SUCCESS;
}

/*
 * Called when a process closes the device file. Undoes what device_open()
 * did: drops the module reference and marks the device as free again.
 */
static int device_release(struct inode *inode, struct file *file)
{
	/* The device may now be opened by the next caller. */
	Device_Open--;

	/*
	 * Without this decrement the usage count would never return to
	 * zero and the module could not be removed.
	 */
	module_put(THIS_MODULE);

	return 0;
}

/*
 * Called when a process, which already opened the dev file, attempts to
 * read from it.
 *
 * Copies the remainder of the message (from msg_Ptr up to its terminating
 * NUL) into the user buffer, at most `length` bytes.
 *
 * Returns the number of bytes copied, 0 at end of message, or -EFAULT if
 * the user buffer cannot be written.
 */
static ssize_t device_read(struct file *filp,	/* see include/linux/fs.h */
			   char *buffer,	/* buffer to fill with data */
			   size_t length,	/* length of the buffer */
			   loff_t * offset)
{
	/* Number of bytes actually written to the buffer */
	int bytes_read = 0;

	/* At the end of the message: return 0 to signal end of file. */
	if (*msg_Ptr == 0)
		return 0;

	while (length && *msg_Ptr) {
		/*
		 * The buffer lives in user space, so a plain "*" assignment
		 * won't work; put_user() copies one value from kernel space
		 * to user space and reports failure for a bad address.
		 */
		if (put_user(*msg_Ptr, buffer))
			return -EFAULT;

		/* Advance only after the byte has really been delivered. */
		msg_Ptr++;
		buffer++;
		length--;
		bytes_read++;
	}

	/* Most read functions return the number of bytes put into the buffer */
	return bytes_read;
}

/*
 * Called when a process writes to dev file: echo "hi" > /dev/hello
 *
 * Writing is not implemented: the attempt is logged and rejected with
 * -EINVAL.
 */
static ssize_t
device_write(struct file *filp, const char *buff, size_t len, loff_t * off)
{
	printk(KERN_ALERT "Sorry, this operation isn't supported.\n");

	return -EINVAL;
}

可以看到成功装载模块,分配到的major number为506

使用mknod命令创建新设备驱动文件,读取内容,可以看到次数发生改变。无写入权限。

4.1.6 兼容多个版本的内核模块

内核暴露给进程的系统调用在各个版本之间基本一致。可能会加入新的系统调用,但是旧的调用的行为会保持不变(向后兼容)。不过有一些系统版本(奇数:开发版)会有改变。

为了适应不同版本,需要编写条件编译指令,将LINUX_VERSION_CODE与KERNEL_VERSION进行比较。如果内核版本为a.b.c,宏的值应为$2^{16}a+2^{8}b+c$

Chapter 5. The /proc File System

5.1 The /proc File System

在Linux系统中,内核和内核模块有一种另外的方式与进程通信——/proc 文件系统。/proc文件系统最初的设计是为了方便访问进程信息,现在它被内核广泛调用来获取信息。

使用/proc文件系统的方法与设备驱动程序相似,使用/proc文件所需信息创建一个结构体,包括指向处理函数的指针。然后init_module注册该结构,cleanup_module注销。

为避免冲突,使用proc_register_dynamic来让内核决定inode number,与普通的文件系统不同,/proc文件系统位于内存中。在正常情况下,inode number是指向文件在磁盘位置的指针。inode包含文件的基本信息,例如权限,硬盘位置指针。

在以下代码中,打开/关闭文件并不会调用函数,因此没有地方放置try_module_get和module_put函数;如果文件被打开时模块被删除,那么会导致内存访问越界。

当使用proc_create函数加载模块时,将创建/proc/helloworld。返回值是一个结构体struct proc_dir_entry,并且该结构体会用于配置/proc/helloworld,例如文件拥有者。若返回空,则表明创建失败。

每当文件/proc/helloworld被读取时,函数procfs_read会被调用。有两个参数比较重要buffer: 返回给读取文件应用进程的内容, offset: 当前文件的位置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/*
* procfs1.c - create a "file" in /proc
*
*/

#include <linux/module.h> /* Specifically, a module */
#include <linux/kernel.h> /* We're doing kernel work */
#include <linux/proc_fs.h> /* Necessary because we use the proc fs */
#include <linux/sched.h>
#include <linux/uaccess.h> /* for copy_to_user */

static struct proc_dir_entry *proc_parent;

/*
 * State shared by the /proc read handler. Kept static so these very
 * generic names do not leak out of this file.
 *   len  - total length of the message
 *   temp - number of bytes of the message still to be read
 *   msg  - the message text served by the /proc file
 */
static int len, temp;
static char *msg;

/*
 * Read handler for the /proc entry.
 *
 * Delivers the message in msg across one or more read() calls; `temp`
 * tracks how many bytes are still outstanding. Once everything has been
 * delivered the next call returns 0 (EOF) and re-arms `temp` so the file
 * can be read again.
 *
 * Returns the number of bytes copied, 0 at EOF, or -EFAULT if the user
 * buffer cannot be written.
 */
static ssize_t read_proc(struct file *filp, char __user *buf, size_t count, loff_t *offp)
{
	/* Bytes already handed out in this pass over the message. */
	size_t sent = len - temp;

	if (count > temp)
		count = temp;

	if (count == 0) {
		/* Nothing (left) to deliver: rewind for the next reader. */
		temp = len;
		return 0;
	}

	/*
	 * Continue from where the previous read stopped rather than from
	 * the start of msg, and use the access-checking copy helper.
	 */
	if (copy_to_user(buf, msg + sent, count))
		return -EFAULT;

	temp -= count;
	return count;
}

/* Only reading is implemented for this /proc entry. */
static struct proc_ops proc_fops = {
	.proc_read = read_proc,
};

void create_new_proc_entry(void)
{
/*create a file named world, and read attribute to this file using proc_fops*/
proc_create("world", 0, NULL, &proc_fops);
msg = "hello world\n"; /*file content*/
len = strlen(msg);
temp = len;
printk(KERN_INFO "1.len=%d", len);
printk(KERN_INFO "proc initialized");
}

int proc_init(void)
{
create_new_proc_entry();
return 0;
}

/* Module exit point: logs the call and removes /proc/world. */
void proc_cleanup(void)
{
	printk(KERN_INFO "Inside cleanup_module\n");

	remove_proc_entry("world", NULL);
}

MODULE_LICENSE("GPL");
module_init(proc_init);
module_exit(proc_cleanup);

结果如下图所示:

5.2 读写/proc文件

文件写入使用copy_from_user或get_user读取用户输入,与read不同。

使用copy_from_user或get_user函数的原因为Linux系统内存是分段的,指针指向的内存地址不是实际内存上的特定地址,而是一个内存段上的地址。内核有一个内存段,其他每个进程都有一个内存段。

每个进程都只能访问它自己的内存段,因此当编写作为进程运行的一般程序时,不需要担心此问题。当编写内核模块时,一般情况下系统会自动让你访问内核的内存段。然而,当内存缓冲区的内容需要在进程与内核之间传递时,内核函数会接受一个指向进程内存段的指针。put_user和get_user可以让你成功访问这些内存地址。不过这两个函数一次只能处理一个字符,可以使用copy_to_user和copy_from_user来处理字符串。由于模块自己的buffer位于内核空间,因此对于写入操作而言需要引入用户空间的数据,即必须使用copy_from_user(或get_user);而对于读取操作,数据本身虽然就在内核态,但目标缓冲区位于用户空间,因此仍需使用copy_to_user(或put_user)将数据拷贝出去(下面的代码也是这样做的)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

/**
* procfs2.c - create a "file" in /proc
*
*/

#include <linux/module.h> /* Specifically, a module */
#include <linux/kernel.h> /* We're doing kernel work */
#include <linux/proc_fs.h> /* Necessary because we use the proc fs */
#include <asm/uaccess.h> /* for copy_from_user */

#define PROCFS_MAX_SIZE 1024
#define PROCFS_NAME "bufferlk"


/**
* The buffer used to store character for this module
*
*/
static char procfs_buffer[PROCFS_MAX_SIZE];
/**
* The size of the buffer
*
*/
static unsigned long procfs_buffer_size = 0;
/**
 * This function is called when the /proc file is read.
 *
 * Copies the stored data, starting at the caller's file offset, into the
 * user buffer: never more than @count bytes and never beyond the stored
 * size. The offset is advanced so that a follow-up read sees EOF instead
 * of the same data again.
 *
 * Returns the number of bytes copied, 0 at EOF, or -EFAULT if the user
 * buffer cannot be written.
 */
static ssize_t read_proc(struct file *filp, char __user *buf, size_t count, loff_t *offp)
{
	loff_t pos = *offp;
	size_t remaining;

	printk(KERN_INFO "procfile_read (/proc/%s) called\n", PROCFS_NAME);
	/* The buffer is not guaranteed to be NUL-terminated: bound the print. */
	printk(KERN_INFO "Read data: %.*s", (int)procfs_buffer_size, procfs_buffer);

	/* Everything already delivered (or nothing stored): end of file. */
	if (pos < 0 || (unsigned long long)pos >= procfs_buffer_size)
		return 0;

	remaining = procfs_buffer_size - (size_t)pos;
	if (remaining > count)
		remaining = count;

	if (copy_to_user(buf, procfs_buffer + pos, remaining))
		return -EFAULT;

	*offp = pos + remaining;
	return remaining;
}

/**
 * This function is called when the /proc file is written.
 *
 * Stores up to PROCFS_MAX_SIZE bytes of user data in procfs_buffer and
 * records how many bytes are valid.
 *
 * Returns the number of bytes accepted, or -EFAULT if the user buffer
 * cannot be read (the stored size is then reset to 0).
 */
static ssize_t write_proc(struct file *file, const char __user *buffer, size_t count,
			  loff_t *offp)
{
	size_t accepted = count;

	/* Clamp to the capacity of the backing buffer. */
	if (accepted > PROCFS_MAX_SIZE)
		accepted = PROCFS_MAX_SIZE;

	if (copy_from_user(procfs_buffer, buffer, accepted)) {
		/* Buffer contents are now indeterminate: expose none of it. */
		procfs_buffer_size = 0;
		return -EFAULT;
	}

	/* Publish the new size only once the data is really in place. */
	procfs_buffer_size = accepted;

	/* The data carries no NUL terminator, so bound the print explicitly. */
	printk(KERN_INFO "Write data: %.*s", (int)procfs_buffer_size, procfs_buffer);

	return accepted;
}

/* Handlers wired to the /proc entry: one for writes, one for reads. */
static struct proc_ops proc_fops = {
	.proc_write = write_proc,
	.proc_read  = read_proc,
};

/**
*This function is called when the module is loaded
*
*/
int init_module()
{
/* create the /proc file */
proc_create(PROCFS_NAME, 0644, NULL, &proc_fops);


printk(KERN_INFO "/proc/%s created\n", PROCFS_NAME);
return 0; /* everything is ok */
}

/**
 * This function is called when the module is unloaded.
 *
 * Removes the /proc entry created at load time. Declared with an explicit
 * (void) parameter list: in C an empty "()" is not a prototype.
 */
void cleanup_module(void)
{
	remove_proc_entry(PROCFS_NAME, NULL);
	printk(KERN_INFO "/proc/%s removed\n", PROCFS_NAME);
}

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rita_Roseweisse");

写入并读取内容

1
2
❯ sudo sh -c 'echo "I LOVE YOU" > /proc/bufferlk'
❯ cat /proc/bufferlk

5.3 Manage /proc file with standard filesystem

还可以使用inode来管理/proc文件,好处是可以使用一些高级函数,例如权限。

Linux系统中,对于文件系统注册有一套标准的方法。因为每一个文件系统都有它自己的操作函数来管理inode和文件操作,需要一个结构体来指向这些函数,结构体struct inode_operations,该结构体包含proc_ops

文件操作与inode操作的区别在于,文件操作处理文件本身,而inode操作处理引用文件的方式,例如创建到该文件的链接。

还有个函数module_permission。当进程要对/proc文件进行操作时会先调用此函数,查看是否有权限来进行操作。

注:内核的read和write函数的作用与正常程序相反,read函数用于输出,write函数用于输入。因为如果进程要从内核中读取数据,内核需要输出,写入数据时,内核是接受数据。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* 
* procfs3.c
*/

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0)
#include <linux/minmax.h>
#endif

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)
#define HAVE_PROC_OPS
#endif

#define PROCFS_MAX_SIZE 2048UL
#define PROCFS_ENTRY_FILENAME "buffer2k"

static struct proc_dir_entry *our_proc_file;
static char procfs_buffer[PROCFS_MAX_SIZE];
static unsigned long procfs_buffer_size = 0;

/*
 * Read handler: hands the stored data to user space in a single chunk.
 *
 * A read at a non-zero offset, or with nothing stored, is treated as EOF
 * and rewinds the offset. Otherwise up to `length` bytes are copied.
 *
 * Returns the number of bytes copied, 0 at EOF, or -EFAULT if the user
 * buffer cannot be written.
 */
static ssize_t procfs_read(struct file *filp, char __user *buffer,
			   size_t length, loff_t *offset)
{
	unsigned long chunk;

	if (*offset || procfs_buffer_size == 0) {
		pr_debug("procfs_read: END\n");
		*offset = 0;
		return 0;
	}

	/*
	 * Size this transfer in a local variable. Writing the clamped value
	 * back into procfs_buffer_size would permanently shrink the stored
	 * data whenever a reader passes a small buffer.
	 */
	chunk = min(procfs_buffer_size, length);
	if (copy_to_user(buffer, procfs_buffer, chunk))
		return -EFAULT;
	*offset += chunk;

	pr_debug("procfs_read: read %lu bytes\n", chunk);
	return chunk;
}
/*
 * Write handler: stores at most PROCFS_MAX_SIZE bytes of user data in
 * procfs_buffer and records the stored length in procfs_buffer_size.
 *
 * Returns the number of bytes stored, or -EFAULT if the user buffer
 * cannot be read.
 */
static ssize_t procfs_write(struct file *file, const char __user *buffer,
			    size_t len, loff_t *off)
{
	unsigned long stored = min(PROCFS_MAX_SIZE, len);

	procfs_buffer_size = stored;
	if (copy_from_user(procfs_buffer, buffer, stored))
		return -EFAULT;

	*off += stored;
	pr_debug("procfs_write: write %lu bytes\n", stored);

	return stored;
}
static int procfs_open(struct inode *inode, struct file *file)
{
try_module_get(THIS_MODULE);
return 0;
}
/* Release handler: drops the module reference taken in procfs_open(). */
static int procfs_close(struct inode *inode, struct file *file)
{
	module_put(THIS_MODULE);

	return 0;
}

/*
 * Operation table for the /proc entry. HAVE_PROC_OPS selects between
 * struct proc_ops and struct file_operations; the same four handlers are
 * installed either way.
 */
#ifdef HAVE_PROC_OPS
static struct proc_ops file_ops_4_our_proc_file = {
	.proc_open    = procfs_open,
	.proc_release = procfs_close,
	.proc_read    = procfs_read,
	.proc_write   = procfs_write,
};
#else
static const struct file_operations file_ops_4_our_proc_file = {
	.open    = procfs_open,
	.release = procfs_close,
	.read    = procfs_read,
	.write   = procfs_write,
};
#endif

/*
 * Module entry point: creates the /proc entry with mode 0644, then sets
 * its size to 80 and its owner to root:root.
 *
 * Returns 0 on success or -ENOMEM if the entry could not be created.
 */
static int __init procfs3_init(void)
{
	struct proc_dir_entry *entry;

	entry = proc_create(PROCFS_ENTRY_FILENAME, 0644, NULL,
			    &file_ops_4_our_proc_file);
	our_proc_file = entry;
	if (!entry) {
		pr_debug("Error: Could not initialize /proc/%s\n",
			 PROCFS_ENTRY_FILENAME);
		return -ENOMEM;
	}

	proc_set_size(entry, 80);
	proc_set_user(entry, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID);

	pr_debug("/proc/%s created\n", PROCFS_ENTRY_FILENAME);
	return 0;
}

/* Module exit point: removes the /proc entry created at load time. */
static void __exit procfs3_exit(void)
{
    remove_proc_entry(PROCFS_ENTRY_FILENAME, NULL);

    pr_debug("/proc/%s removed\n", PROCFS_ENTRY_FILENAME);
}

/* Register the load and unload entry points of the module. */
module_init(procfs3_init);
module_exit(procfs3_exit);

/* Declare the module licence. */
MODULE_LICENSE("GPL");

5.4 Manage /proc file with seq_file

前面构建/proc文件的过程比较复杂。因此,为了更方便地编写/proc文件,内核提供了seq_file API用于格式化/proc文件的输出。seq_file基于时序(sequence),主要由三个函数组成:start()、next()、stop()。当用户读取/proc文件时,seq_file会开始一个新的时序。

时序以start()为开始。如果其返回值不为空,则继续调用next()函数。该函数是一个迭代器,用于遍历所有的数据。每次调用next()函数时,都会调用show()函数,show()函数会把数据值写入用户读取的缓冲区。当next()函数返回空值时循环结束,随后stop()函数被调用。stop()函数调用结束后,会再次调用start()函数,直到start()函数返回空值为止。

注:seq_file为proc_ops提供了基本的函数,例如seq_read、seq_lseek等,但是没有提供写入函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/* 
* procfs4.c - create a "file" in /proc
* This program uses the seq_file library to manage the /proc file.
*/

#include <linux/kernel.h> /* We are doing kernel work */
#include <linux/module.h> /* Specifically, a module */
#include <linux/proc_fs.h> /* Necessary because we use proc fs */
#include <linux/seq_file.h> /* for seq_file */
#include <linux/version.h>

/*
 * Define HAVE_PROC_OPS when building against kernel 5.6.0 or later; the
 * operation table further down uses it to choose between struct proc_ops
 * and struct file_operations.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)
#define HAVE_PROC_OPS
#endif

/* Name of the /proc entry created and removed by this module. */
#define PROC_NAME "iter"

/* Sequence start callback.
 *
 * With *pos == 0 a new sequence begins and a pointer to the function-local
 * static counter is returned as the iterator. With any other position the
 * sequence is over: *pos is rewound to 0 and NULL is returned so reading
 * stops.
 */
static void *my_seq_start(struct seq_file *s, loff_t *pos)
{
    static unsigned long counter = 0;

    if (*pos != 0) {
        /* end of the sequence: rewind and tell the caller to stop */
        *pos = 0;
        return NULL;
    }

    /* fresh sequence: any non-NULL value starts it */
    return &counter;
}

/* Sequence step callback.
 *
 * Increments both the counter that @v points to and the position, then
 * returns NULL, which ends the sequence after a single element.
 */
static void *my_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
    unsigned long *counter = v;

    ++*counter;
    ++*pos;

    return NULL;
}

/* Sequence stop callback: intentionally empty, because the iterator handed
 * out by start() is a static object that needs no cleanup.
 */
static void my_seq_stop(struct seq_file *s, void *v)
{
}

/* Sequence show callback: prints the current counter value on its own line.
 *
 * @v is the iterator returned by my_seq_start(), i.e. a pointer to an
 * unsigned long. It must be read with that exact type: going through a
 * loff_t pointer reads the wrong width wherever loff_t and unsigned long
 * differ in size.
 *
 * Always returns 0.
 */
static int my_seq_show(struct seq_file *s, void *v)
{
    const unsigned long *counter = v;

    seq_printf(s, "%lu\n", *counter);
    return 0;
}

/* This structure gather "function" to manage the sequence */
static struct seq_operations my_seq_ops = {
.start = my_seq_start,
.next = my_seq_next,
.stop = my_seq_stop,
.show = my_seq_show,
};

/* Open handler of the /proc entry: attaches the seq_file callbacks to @file.
 *
 * Returns whatever seq_open() returns.
 */
static int my_open(struct inode *inode, struct file *file)
{
    return seq_open(file, &my_seq_ops);
}

/* Operation table of the /proc entry. Only open is module specific; read,
 * seek and release are delegated to the seq_file helpers. HAVE_PROC_OPS
 * selects which structure type is used.
 */
#ifdef HAVE_PROC_OPS
static const struct proc_ops my_file_ops = {
    .proc_open = my_open,
    .proc_release = seq_release,
    .proc_read = seq_read,
    .proc_lseek = seq_lseek,
};
#else
static const struct file_operations my_file_ops = {
    .open = my_open,
    .release = seq_release,
    .read = seq_read,
    .llseek = seq_lseek,
};
#endif

/* Module entry point: creates /proc/<PROC_NAME> backed by my_file_ops.
 *
 * Returns 0 on success or -ENOMEM when the entry cannot be created.
 */
static int __init procfs4_init(void)
{
    struct proc_dir_entry *entry = proc_create(PROC_NAME, 0, NULL,
                                               &my_file_ops);

    if (!entry) {
        pr_debug("Error: Could not initialize /proc/%s\n", PROC_NAME);
        return -ENOMEM;
    }

    return 0;
}

/* Module exit point: removes the /proc entry and logs the removal. */
static void __exit procfs4_exit(void)
{
    remove_proc_entry(PROC_NAME, NULL);

    pr_debug("/proc/%s removed\n", PROC_NAME);
}

/* Register the load and unload entry points of the module. */
module_init(procfs4_init);
module_exit(procfs4_exit);

/* Declare the module licence. */
MODULE_LICENSE("GPL");

Chapter 6 Interacting with your module

sysfs允许用户空间通过读取或设置模块中的变量,与运行中的内核进行交互。可以用下面的命令查看系统的sysfs目录:

1
ls -l /sys

可以在文件系统中以常规文件的形式为kobjects导出属性。sysfs将文件I/O操作转发给为属性定义的方法,从而提供了读写内核属性的途径。

一个属性的简单定义如下:

1
2
3
4
5
6
7
8
/* Simplified attribute definition as presented by the surrounding text. */
struct attribute { 
char *name; /* presumably the file name shown under sysfs -- TODO confirm */
struct module *owner;
umode_t mode; /* presumably the file permission bits -- TODO confirm */
};

/* Create / remove the file that exposes @attr for the object @kobj. */
int sysfs_create_file(struct kobject * kobj, const struct attribute * attr);
void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr);

例如,设备驱动模型定义了device_attribute

1
2
3
4
5
6
7
8
9
10
/* Attribute wrapper of the device driver model, as quoted by the text:
 * the generic attribute plus the two accessor callbacks.
 */
struct device_attribute { 
struct attribute attr;
/* read accessor; receives the device, the attribute and an output buffer */
ssize_t (*show)(struct device *dev, struct device_attribute *attr,
char *buf);
/* write accessor; receives the device, the attribute, the input and its size */
ssize_t (*store)(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count);
};

/* Create / remove the file for a device attribute. */
int device_create_file(struct device *, const struct device_attribute *);
void device_remove_file(struct device *, const struct device_attribute *);

为了读写attributes,show()和store()方法必须声明定义。一般情况下include/linux/sysfs.h提供了便捷的宏用来简化定义。

以下为一个hello world模块,通过sysfs实现了创建一个可以访问的变量。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
/* 
* hello-sysfs.c sysfs example
*/
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/sysfs.h>

/* kobject created in mymodule_init() and released in mymodule_exit(). */
static struct kobject *mymodule;

/* the variable you want to be able to change */
static int myvariable = 0;

/* Read accessor of the attribute: formats myvariable as a decimal number
 * followed by a newline into @buf and returns the length that sprintf()
 * reports.
 */
static ssize_t myvariable_show(struct kobject *kobj,
                               struct kobj_attribute *attr, char *buf)
{
    ssize_t written = sprintf(buf, "%d\n", myvariable);

    return written;
}

/* Write accessor of the attribute: parses a decimal integer from @buf and
 * stores it in myvariable.
 *
 * @buf is const so that the prototype matches the store member of
 * struct kobj_attribute and no function-pointer cast is needed below.
 *
 * Returns @count when a number was parsed, or -EINVAL (leaving myvariable
 * untouched) when the input does not start with an integer.
 */
static ssize_t myvariable_store(struct kobject *kobj,
                                struct kobj_attribute *attr, const char *buf,
                                size_t count)
{
    int value;

    if (sscanf(buf, "%d", &value) != 1)
        return -EINVAL;

    myvariable = value;
    return count;
}

/* Attribute "myvariable", mode 0660, wired to the two accessors above. */
static struct kobj_attribute myvariable_attribute =
    __ATTR(myvariable, 0660, myvariable_show, myvariable_store);

/* Module entry point: creates the kobject "mymodule" below kernel_kobj and
 * adds the myvariable attribute file to it.
 *
 * Returns 0 on success, -ENOMEM when the kobject cannot be created, or the
 * error reported by sysfs_create_file(). In the latter case the kobject is
 * released again so that a failed load leaves nothing behind.
 */
static int __init mymodule_init(void)
{
    int error;

    pr_info("mymodule: initialised\n");

    mymodule = kobject_create_and_add("mymodule", kernel_kobj);
    if (!mymodule)
        return -ENOMEM;

    error = sysfs_create_file(mymodule, &myvariable_attribute.attr);
    if (error) {
        pr_info("failed to create the myvariable file "
                "in /sys/kernel/mymodule\n");
        /* The exit handler never runs after a failed init: undo it here. */
        kobject_put(mymodule);
        mymodule = NULL;
    }

    return error;
}

/* Module exit point: logs the unload and drops the reference on the kobject
 * created at load time.
 */
static void __exit mymodule_exit(void)
{
    pr_info("mymodule: Exit success\n");

    kobject_put(mymodule);
}

/* Register the load and unload entry points of the module. */
module_init(mymodule_init);
module_exit(mymodule_exit);

/* Declare the module licence. */
MODULE_LICENSE("GPL");

效果图如下,成功实现对sys文件下模块的变量的读写。

上面的例子使用kobject在sysfs下创建目录,并通过该目录下的属性文件与内核进行信息交互。kobject最初是作为统一内核代码中引用计数对象管理的一种简单方法而引入的,目前是将设备驱动模型与sysfs接口联系在一起的粘合剂。

Chapter 7. Talking To Device Files

7.1 Talking to Device Files (write and IOCTLS)

设备文件(/dev目录包含了所有linux中使用的外部设备,但是不包含外部设备的驱动信息,外设驱动在/kernel/drivers)代表了物理设备。大多数的设备既有输入也有输出,因此需要一些方法让内核中的设备驱动从进程获取输出并发送给设备,这是通过打开设备文件并向其写入来实现的。下面的例子实现了device_write。

当然只有这些写入操作还不够。假设有一个场景需要通过串口与调制解调器通信:可以通过读写设备文件实现消息的接收与发送,但是应该如何与串口本身通信(例如配置波特率等信息)仍是一个问题。

Unix系统提供了ioctl(Input Output Control)函数,每个设备都有自己的ioctl命令,可以读取(从进程发送信息到内核),写入(从内核返回信息到进程)等。

ioctl函数有三个参数:合适的设备文件描述符,ioctl数,参数(long型)可以强制转换来传递任何内容。可以传递结构体指针。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
/* 
* ioctl.c
*/
#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/ioctl.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

/* Payload exchanged with user space by IOCTL_VALSET and IOCTL_VALGET. */
struct ioctl_arg {
unsigned int val;
};

/* Documentation/userspace-api/ioctl/ioctl-number.rst */
/* Magic byte shared by all four ioctl command numbers below. */
#define IOC_MAGIC '\x66'

/* Command numbers: set/get through struct ioctl_arg, get/set the plain int. */
#define IOCTL_VALSET _IOW(IOC_MAGIC, 0, struct ioctl_arg)
#define IOCTL_VALGET _IOR(IOC_MAGIC, 1, struct ioctl_arg)
#define IOCTL_VALGET_NUM _IOR(IOC_MAGIC, 2, int)
#define IOCTL_VALSET_NUM _IOW(IOC_MAGIC, 3, int)

/* Highest command index used above; not referenced in the code shown here. */
#define IOCTL_VAL_MAXNR 3
/* Name passed to alloc_chrdev_region() and used in log messages. */
#define DRIVER_NAME "ioctltest"

/* Major number obtained in ioctl_init(); reused by ioctl_exit(). */
static unsigned int test_ioctl_major = 0;
/* Number of minors requested, added and released. */
static unsigned int num_of_dev = 1;
/* Character device registered by ioctl_init(). */
static struct cdev test_ioctl_cdev;
/* Value read and written by IOCTL_VALGET_NUM / IOCTL_VALSET_NUM. */
static int ioctl_num = 0;

/* Per-open state allocated in test_ioctl_open() and stored in
 * filp->private_data.
 */
struct test_ioctl_data {
unsigned char val; /* byte returned by read() and IOCTL_VALGET */
rwlock_t lock; /* taken around every access to val */
};

static long test_ioctl_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
struct test_ioctl_data *ioctl_data = filp->private_data;
int retval = 0;
unsigned char val;
struct ioctl_arg data;
memset(&data, 0, sizeof(data));

switch (cmd) {
case IOCTL_VALSET:
if (copy_from_user(&data, (int __user *)arg, sizeof(data))) {
retval = -EFAULT;
goto done;
}

pr_alert("IOCTL set val:%x .\n", data.val);
write_lock(&ioctl_data->lock);
ioctl_data->val = data.val;
write_unlock(&ioctl_data->lock);
break;

case IOCTL_VALGET:
read_lock(&ioctl_data->lock);
val = ioctl_data->val;
read_unlock(&ioctl_data->lock);
data.val = val;

if (copy_to_user((int __user *)arg, &data, sizeof(data))) {
retval = -EFAULT;
goto done;
}

break;

case IOCTL_VALGET_NUM:
retval = __put_user(ioctl_num, (int __user *)arg);
break;

case IOCTL_VALSET_NUM:
ioctl_num = arg;
break;

default:
retval = -ENOTTY;
}

done:
return retval;
}

/* Read handler: fills the whole user buffer with copies of the per-open
 * byte. The file position is not consulted or advanced.
 *
 * Returns @count on success or -EFAULT when the user buffer cannot be
 * written.
 */
static ssize_t test_ioctl_read(struct file *filp, char __user *buf,
                               size_t count, loff_t *f_pos)
{
    struct test_ioctl_data *state = filp->private_data;
    unsigned char fill;
    int idx;

    /* Snapshot the byte under the lock, copy it out without the lock. */
    read_lock(&state->lock);
    fill = state->val;
    read_unlock(&state->lock);

    for (idx = 0; idx < count; idx++) {
        if (copy_to_user(&buf[idx], &fill, 1))
            return -EFAULT;
    }

    return (int)count;
}

/* Release handler: frees the per-open state and clears the pointer.
 * Always returns 0.
 */
static int test_ioctl_close(struct inode *inode, struct file *filp)
{
    pr_alert("%s call.\n", __func__);

    /* kfree(NULL) is a no-op, so no guard is required. */
    kfree(filp->private_data);
    filp->private_data = NULL;

    return 0;
}

/* Open handler: allocates the per-open state, initialises its lock, sets
 * the byte to 0xFF and attaches the state to the file.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int test_ioctl_open(struct inode *inode, struct file *filp)
{
    struct test_ioctl_data *state;

    pr_alert("%s call.\n", __func__);

    state = kmalloc(sizeof(*state), GFP_KERNEL);
    if (!state)
        return -ENOMEM;

    rwlock_init(&state->lock);
    state->val = 0xFF;
    filp->private_data = state;

    return 0;
}

static struct file_operations fops = {
.owner = THIS_MODULE,
.open = test_ioctl_open,
.release = test_ioctl_close,
.read = test_ioctl_read,
.unlocked_ioctl = test_ioctl_ioctl,
};

/* Module entry point: reserves a dynamically numbered character device
 * region, records its major number and registers the cdev.
 *
 * Returns 0 on success. On failure the error code of the failing call is
 * passed through unchanged (instead of a bare -1, which reads as -EPERM)
 * and everything acquired so far is released again.
 */
static int __init ioctl_init(void)
{
    dev_t dev;
    int ret;

    ret = alloc_chrdev_region(&dev, 0, num_of_dev, DRIVER_NAME);
    if (ret)
        return ret;

    test_ioctl_major = MAJOR(dev);
    cdev_init(&test_ioctl_cdev, &fops);

    ret = cdev_add(&test_ioctl_cdev, dev, num_of_dev);
    if (ret) {
        /* The cdev was never added, only the region needs to go. */
        unregister_chrdev_region(dev, num_of_dev);
        return ret;
    }

    pr_alert("%s driver(major: %d) installed.\n", DRIVER_NAME,
             test_ioctl_major);
    return 0;
}

/* Module exit point: deletes the cdev, releases the device number region
 * that starts at minor 0 of the recorded major, and logs the removal.
 */
static void __exit ioctl_exit(void)
{
    cdev_del(&test_ioctl_cdev);
    unregister_chrdev_region(MKDEV(test_ioctl_major, 0), num_of_dev);

    pr_alert("%s driver removed.\n", DRIVER_NAME);
}

/* Register the load and unload entry points of the module. */
module_init(ioctl_init);
module_exit(ioctl_exit);

/* Module metadata: licence and a one-line description. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("This is test_ioctl module");

test_ioctl_ioctl函数有一个参数cmd,这个就是ioctl number。ioctl number编码了major number、ioctl类型、命令和参数类型,通常使用宏(_IO、_IOR、_IOW、_IOWR)来生成。内核模块与用户程序都应包含定义这些ioctl的头文件:在下面的示例中,头文件是chardev.h,使用它的用户程序是userspace_ioctl.c。

另外,我们需要注意的是,对共享资源的并发访问将导致竞态条件。解决方法是使用原子比较与交换(atomic Compare-And-Swap (CAS)),下面的chardev2.c就通过atomic_cmpxchg来保证同一时刻只有一个调用者进入ioctl处理。

一个完整的与device file通信的示例

char_dev2.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/* 
* chardev2.c - Create an input/output character device
*/

#include <linux/atomic.h>
#include <linux/cdev.h>
#include <linux/delay.h>
#include <linux/device.h>
#include <linux/err.h> /* for IS_ERR and PTR_ERR */
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h> /* Specifically, a module */
#include <linux/printk.h>
#include <linux/types.h>
#include <linux/uaccess.h> /* for get_user and put_user */
#include <linux/version.h>

#include <asm/errno.h>

#include "chardev.h"
/* Return value used by the handlers below to signal success. */
#define SUCCESS 0
/* Name under which the character device is registered and unregistered. */
#define DEVICE_NAME "char_dev"
/* Maximum number of message bytes accepted by device_write(). */
#define BUF_LEN 80

/* States of the already_open flag. */
enum {
CDEV_NOT_USED = 0,
CDEV_EXCLUSIVE_OPEN = 1,
};

/* Busy flag claimed with atomic_cmpxchg() at the start of device_ioctl()
 * and cleared again before it returns, so that only one ioctl runs at a
 * time.
 */
static atomic_t already_open = ATOMIC_INIT(CDEV_NOT_USED);

/* The message the device will give when asked */
static char message[BUF_LEN + 1];

/* Device class created in chardev2_init() and destroyed in chardev2_exit(). */
static struct class *cls;

/* This is called whenever a process attempts to open the device file */
static int device_open(struct inode *inode, struct file *file)
{
pr_info("device_open(%p)\n", file);

try_module_get(THIS_MODULE);
return SUCCESS;
}

/* Called when the device file is closed: logs the call, drops the module
 * reference and reports SUCCESS.
 */
static int device_release(struct inode *inode, struct file *file)
{
    pr_info("device_release(%p,%p)\n", inode, file);

    module_put(THIS_MODULE);

    return SUCCESS;
}

/* Called whenever a process that has opened the device file reads from it
 * (and by device_ioctl() for IOCTL_GET_MSG).
 *
 * Copies the stored message, starting at *offset, into the user buffer
 * until @length bytes were written or the terminating NUL is reached.
 *
 * Returns the number of bytes copied, 0 at end of message (in which case
 * *offset is rewound to 0), or -EFAULT when the user buffer cannot be
 * written.
 */
static ssize_t device_read(struct file *file, /* see include/linux/fs.h */
                           char __user *buffer, /* buffer to be filled */
                           size_t length, /* length of the buffer */
                           loff_t *offset)
{
    /* Number of bytes actually written to the buffer */
    ssize_t bytes_read = 0;
    const char *message_ptr;

    /*
     * The offset is caller controlled (e.g. via pread), so it has to be
     * inside message[] before it is used as an index. Anything out of
     * range, or pointing at the terminator, is treated as end of message.
     */
    if (*offset < 0 || *offset > BUF_LEN || !message[*offset]) {
        *offset = 0; /* reset the offset */
        return 0; /* signify end of file */
    }

    message_ptr = message + *offset;

    /* Actually put the data into the buffer */
    while (length && *message_ptr) {
        /* The buffer lives in user space, so a plain assignment would
         * not work; put_user copies one byte across and reports a
         * faulting address through its return value.
         */
        if (put_user(*message_ptr, buffer))
            return -EFAULT;
        message_ptr++;
        buffer++;
        length--;
        bytes_read++;
    }

    pr_info("Read %zd bytes, %zu left\n", bytes_read, length);

    *offset += bytes_read;

    return bytes_read;
}

/* Called when somebody writes into the device file (and by device_ioctl()
 * for IOCTL_SET_MSG, which passes a NULL @offset; @offset is not used).
 *
 * Replaces the stored message with at most BUF_LEN bytes from user space
 * and terminates it, so that a short write does not leave the tail of a
 * longer, older message visible to readers.
 *
 * Returns the number of input bytes consumed, or -EFAULT when the user
 * buffer cannot be read.
 */
static ssize_t device_write(struct file *file, const char __user *buffer,
                            size_t length, loff_t *offset)
{
    size_t i;

    pr_info("device_write(%p,%p,%zu)\n", file, buffer, length);

    for (i = 0; i < length && i < BUF_LEN; i++) {
        if (get_user(message[i], buffer + i)) {
            message[i] = '\0'; /* keep what was stored so far terminated */
            return -EFAULT;
        }
    }

    /* i <= BUF_LEN and message[] has BUF_LEN + 1 bytes, so this fits. */
    message[i] = '\0';

    /* Again, return the number of input characters used. */
    return i;
}

/* This function is called whenever a process tries to do an ioctl on our
* device file. We get two extra parameters (additional to the inode and file
* structures, which all device functions get): the number of the ioctl called
* and the parameter given to the ioctl function.
*
* If the ioctl is write or read/write (meaning output is returned to the
* calling process), the ioctl call returns the output of this function.
*/
static long
device_ioctl(struct file *file, /* ditto */
unsigned int ioctl_num, /* number and param for ioctl */
unsigned long ioctl_param)
{
int i;
long ret = SUCCESS;

/* We don't want to talk to two processes at the same time. */
if (atomic_cmpxchg(&already_open, CDEV_NOT_USED, CDEV_EXCLUSIVE_OPEN))
return -EBUSY;

/* Switch according to the ioctl called */
switch (ioctl_num) {
case IOCTL_SET_MSG: {
/* Receive a pointer to a message (in user space) and set that to
* be the device's message. Get the parameter given to ioctl by
* the process.
*/
char __user *tmp = (char __user *)ioctl_param;
char ch;

/* Find the length of the message */
get_user(ch, tmp);
for (i = 0; ch && i < BUF_LEN; i++, tmp++)
get_user(ch, tmp);

device_write(file, (char __user *)ioctl_param, i, NULL);
break;
}
case IOCTL_GET_MSG: {
loff_t offset = 0;

/* Give the current message to the calling process - the parameter
* we got is a pointer, fill it.
*/
i = device_read(file, (char __user *)ioctl_param, 99, &offset);

/* Put a zero at the end of the buffer, so it will be properly
* terminated.
*/
put_user('\0', (char __user *)ioctl_param + i);
break;
}
case IOCTL_GET_NTH_BYTE:
/* This ioctl is both input (ioctl_param) and output (the return
* value of this function).
*/
ret = (long)message[ioctl_param];
break;
}

/* We're now ready for our next caller */
atomic_set(&already_open, CDEV_NOT_USED);

return ret;
}

/* Module Declarations */

/* This structure will hold the functions to be called when a process does
* something to the device we created. Since a pointer to this structure
* is kept in the devices table, it can't be local to init_module. NULL is
* for unimplemented functions.
*/
static struct file_operations fops = {
.read = device_read,
.write = device_write,
.unlocked_ioctl = device_ioctl,
.open = device_open,
.release = device_release, /* a.k.a. close */
};

/* Initialize the module: register the character device under MAJOR_NUM,
 * then create the device class and the device node for it.
 *
 * Returns 0 on success. On failure the error of the failing step is
 * returned and every earlier step is undone, so a failed load leaves no
 * registered major or class behind.
 */
static int __init chardev2_init(void)
{
    struct device *dev;
    /* Register the character device (at least try) */
    int ret_val = register_chrdev(MAJOR_NUM, DEVICE_NAME, &fops);

    /* Negative values signify an error */
    if (ret_val < 0) {
        pr_alert("%s failed with %d\n",
                 "Sorry, registering the character device ", ret_val);
        return ret_val;
    }

#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0)
    cls = class_create(DEVICE_FILE_NAME);
#else
    cls = class_create(THIS_MODULE, DEVICE_FILE_NAME);
#endif
    /* class_create() reports failure through an ERR_PTR, not NULL. */
    if (IS_ERR(cls)) {
        ret_val = PTR_ERR(cls);
        goto err_unregister;
    }

    dev = device_create(cls, NULL, MKDEV(MAJOR_NUM, 0), NULL,
                        DEVICE_FILE_NAME);
    if (IS_ERR(dev)) {
        ret_val = PTR_ERR(dev);
        goto err_class;
    }

    pr_info("Device created on /dev/%s\n", DEVICE_FILE_NAME);

    return 0;

err_class:
    class_destroy(cls);
err_unregister:
    unregister_chrdev(MAJOR_NUM, DEVICE_NAME);
    return ret_val;
}

/* Cleanup: tear down in reverse order of creation - the device node, the
 * class, and finally the character device registration.
 */
static void __exit chardev2_exit(void)
{
    dev_t devt = MKDEV(MAJOR_NUM, 0);

    device_destroy(cls, devt);
    class_destroy(cls);

    unregister_chrdev(MAJOR_NUM, DEVICE_NAME);
}

/* Register the load and unload entry points of the module. */
module_init(chardev2_init);
module_exit(chardev2_exit);

/* Declare the module licence. */
MODULE_LICENSE("GPL");

chardev.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/*
 * chardev.h - the header file with the ioctl definitions.
 *
 * The declarations here have to be in a header file, because they need
 * to be known both to the kernel module (in chardev2.c) and the process
 * calling ioctl() (in userspace_ioctl.c).
 */

#ifndef CHARDEV_H
#define CHARDEV_H

#include <linux/ioctl.h>

/* The major device number. It is fixed rather than dynamically assigned
 * because the ioctl command numbers below are derived from it and must be
 * known to both sides at compile time.
 */
#define MAJOR_NUM 100

/* Set the message of the device driver */
#define IOCTL_SET_MSG _IOW(MAJOR_NUM, 0, char *)
/* _IOW means that we are creating an ioctl command number for passing
 * information from a user process to the kernel module.
 *
 * The first argument, MAJOR_NUM, is the major device number we are using.
 *
 * The second argument is the number of the command (there could be several
 * with different meanings).
 *
 * The third argument is the type we want to get from the process to the
 * kernel.
 */

/* Get the message of the device driver */
#define IOCTL_GET_MSG _IOR(MAJOR_NUM, 1, char *)
/* This IOCTL is used for output, to get the message of the device driver.
 * However, the buffer that receives the message is still an input, because
 * it is allocated by the calling process.
 */

/* Get the n'th byte of the message */
#define IOCTL_GET_NTH_BYTE _IOWR(MAJOR_NUM, 2, int)
/* The IOCTL is used for both input and output. It receives from the user
 * a number, n, and returns message[n].
 */

/* The name of the device file, and the path the user program opens. */
#define DEVICE_FILE_NAME "char_dev"
#define DEVICE_PATH "/dev/char_dev"

#endif

userspace_ioctl.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/*  userspace_ioctl.c - the process to use ioctl's to control the kernel module 
*
* Until now we could have used cat for input and output. But now
* we need to do ioctl's, which require writing our own process.
*/

/* device specifics, such as ioctl numbers and the
* major device file. */
#include "../chardev.h"

#include <stdio.h> /* standard I/O */
#include <fcntl.h> /* open */
#include <unistd.h> /* close */
#include <stdlib.h> /* exit */
#include <sys/ioctl.h> /* ioctl */

/* Functions for the ioctl calls */

/* Issue IOCTL_SET_MSG with @message on @file_desc.
 *
 * Prints a diagnostic when the call fails and returns the raw ioctl()
 * result either way.
 */
int ioctl_set_msg(int file_desc, char *message)
{
    int ret_val = ioctl(file_desc, IOCTL_SET_MSG, message);

    if (ret_val < 0)
        printf("ioctl_set_msg failed:%d\n", ret_val);

    return ret_val;
}

/* Issue IOCTL_GET_MSG on @file_desc and print whatever ended up in the
 * local buffer.
 *
 * Prints a diagnostic when the call fails and returns the raw ioctl()
 * result either way.
 */
int ioctl_get_msg(int file_desc)
{
    /* Warning - this is dangerous: the kernel is not told how large the
     * buffer is, so it might write past its end. A production program
     * would use two ioctls - one to announce the buffer length and
     * another to hand over the buffer to fill.
     */
    char message[100] = { 0 };
    int ret_val = ioctl(file_desc, IOCTL_GET_MSG, message);

    if (ret_val < 0)
        printf("ioctl_get_msg failed:%d\n", ret_val);

    printf("get_msg message:%s", message);
    return ret_val;
}

/* Fetch the device message one byte at a time with IOCTL_GET_NTH_BYTE and
 * print it, stopping at the terminating NUL (which is not printed).
 *
 * Returns 0 once the terminator was reached, or the negative ioctl() result
 * of the first failing call; the diagnostic names the index that failed.
 */
int ioctl_get_nth_byte(int file_desc)
{
    int i, c;

    printf("get_nth_byte message:");

    for (i = 0;; i++) {
        c = ioctl(file_desc, IOCTL_GET_NTH_BYTE, i);

        if (c < 0) {
            /* i is still the index that was requested. */
            printf("\nioctl_get_nth_byte failed at the %d'th byte:\n", i);
            return c;
        }

        if (c == 0)
            break;

        putchar(c);
    }

    return 0;
}

/* Main - open the device and run the three ioctl helpers in order (set the
 * message, read it back byte by byte, read it back as a whole), stopping at
 * the first helper that returns non-zero.
 *
 * Exits with EXIT_FAILURE when the device cannot be opened or a helper
 * fails; returns 0 otherwise. The descriptor is closed on both paths.
 */
int main(void)
{
    char *msg = "Message passed by ioctl\n";
    int file_desc, ret_val;

    file_desc = open(DEVICE_PATH, O_RDWR);
    if (file_desc < 0) {
        printf("Can't open device file: %s, error:%d\n", DEVICE_PATH,
               file_desc);
        exit(EXIT_FAILURE);
    }

    ret_val = ioctl_set_msg(file_desc, msg);
    if (!ret_val)
        ret_val = ioctl_get_nth_byte(file_desc);
    if (!ret_val)
        ret_val = ioctl_get_msg(file_desc);

    close(file_desc);

    if (ret_val)
        exit(EXIT_FAILURE);
    return 0;
}