只是阅读源码时的笔记..顺道复现了拖了很久的DirtyPipe.
源码分析
1 | const struct file_operations pipefifo_fops = { |
pipe的创建(pipe,pipe2)
pipe和pipe2的系统调用都转到do_pipe2处理.调用__do_pipe_flags完成pipe的创建,然后将文件描述符拷贝到用户,如果成功则调用fd_install使文件描述符生效.
1 |
|
fd_install 将当前任务的文件描述符表中fd的对应表项与该文件关联.
先从该任务的task_struct中获取打开文件表,再从打开文件表中获取到文件描述符表.
1 | /* |
__do_pipe_flags函数:
- 检查flags合法性
- create_pipe_files创建pipe文件
- 获取两个未用的文件描述符.
34static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
int error;
int fdw, fdr;
if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
return -EINVAL;
error = create_pipe_files(files, flags);
if (error)
return error;
error = get_unused_fd_flags(flags);
if (error < 0)
goto err_read_pipe;
fdr = error;
error = get_unused_fd_flags(flags);
if (error < 0)
goto err_fdr;
fdw = error;
audit_fd_pair(fdr, fdw);
fd[0] = fdr;
fd[1] = fdw;
return 0;
err_fdr:
put_unused_fd(fdr);
err_read_pipe:
fput(files[0]);
fput(files[1]);
return error;
}
create_pipe_files函数
- get_pipe_inode分配inode及pipe本体(pipe_inode_info结构),完成初始化并将二者关联.
- alloc_file_pseudo分配一个虚拟文件并与管道的inode关联.
- 克隆该虚拟文件作为管道的另一端
- stream_open将文件设置为流文件(not seekable and don’t have notion of position)
92/**
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
* @head: The point of buffer production
* @tail: The point of buffer consumption
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
* @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
* @files: number of struct file referring this pipe (protected by ->i_lock)
* @r_counter: reader counter
* @w_counter: writer counter
* @fasync_readers: reader side fasync
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
* @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
bool note_loss;
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
struct watch_queue *watch_queue;
};
/*
 * create_pipe_files - allocate the pipe inode and both struct files.
 * res[0] becomes the read end, res[1] the write end; both ends share
 * the same pipe_inode_info through ->private_data.
 * Returns 0 on success or a negative errno (inode/pipe are torn down
 * on every failure path).
 */
int create_pipe_files(struct file **res, int flags)
{
/* Allocate pseudo inode + pipe_inode_info, already linked together. */
struct inode *inode = get_pipe_inode();
struct file *f;
int error;
if (!inode)
return -ENFILE;
/* Optional watch-queue (notification pipe) setup. */
if (flags & O_NOTIFICATION_PIPE) {
error = watch_queue_init(inode->i_pipe);
if (error) {
free_pipe_info(inode->i_pipe);
iput(inode);
return error;
}
}
/* Write-side file: only O_NONBLOCK / O_DIRECT survive from flags. */
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
&pipefifo_fops);
if (IS_ERR(f)) {
free_pipe_info(inode->i_pipe);
iput(inode);
return PTR_ERR(f);
}
f->private_data = inode->i_pipe;
/* Read side is a clone of the write-side file, opened O_RDONLY. */
res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
&pipefifo_fops);
if (IS_ERR(res[0])) {
put_pipe_info(inode, inode->i_pipe);
fput(f);
return PTR_ERR(res[0]);
}
res[0]->private_data = inode->i_pipe;
res[1] = f;
/* Pipes are streams: not seekable, no notion of file position. */
stream_open(inode, res[0]);
stream_open(inode, res[1]);
return 0;
}
1 | /* |
get_pipe_inode函数:
- new_inode_pseudo分配虚拟的inode.
- alloc_pipe_info创建pipe本体pipe_inode_info
39static struct inode * get_pipe_inode(void)
{
struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
struct pipe_inode_info *pipe;
if (!inode)
goto fail_inode;
inode->i_ino = get_next_ino();
pipe = alloc_pipe_info();
if (!pipe)
goto fail_iput;
inode->i_pipe = pipe;
pipe->files = 2;
pipe->readers = pipe->writers = 1;
inode->i_fop = &pipefifo_fops;
/*
* Mark the inode dirty from the very beginning,
* that way it will never be moved to the dirty
* list because "mark_inode_dirty()" will think
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
return inode;
fail_iput:
iput(inode);
fail_inode:
return NULL;
}
alloc_pipe_info函数.
- kzalloc分配pipe_inode_info的空间
- kcalloc分配pipe_buffer的空间(下面具体分析).
47struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
struct user_struct *user = get_current_user();
unsigned long user_bufs;
unsigned int max_size = READ_ONCE(pipe_max_size);
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
if (pipe == NULL)
goto out_free_uid;
if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
pipe_bufs = max_size >> PAGE_SHIFT;
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
pipe_bufs = 1;
}
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);
if (pipe->bufs) {
init_waitqueue_head(&pipe->rd_wait);
init_waitqueue_head(&pipe->wr_wait);
pipe->r_counter = pipe->w_counter = 1;
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
}
out_revert_acct:
(void) account_pipe_buffers(user, pipe_bufs, 0);
kfree(pipe);
out_free_uid:
free_uid(user);
return NULL;
}
pipe_buffer的分配是一次性分配PIPE_DEF_BUFFERS(16)个.
1 | /** |
但可以通过以下调用链重新设置pipe缓冲区的总大小并alloc and copy,即pipe_buffer的数量.堆喷手段get.
1 | pipe_fcntl |
1 | /* |
pipe_write
比较长,分段分析.
用户数据是通过io向量来存的
1 | struct iov_iter { |
如果该pipe没有读者 (!pipe->readers)直接返回-EPIPE.
1 | static ssize_t |
注意这里的读者不是说read阻塞在该pipe上的任务数,而是以可读方式打开了该管道的计数,对于匿名管道来说readers和writers都为1.而对于有名管道fifo,则是通过fifo_open时的读写方式来增加计数.
1 | static struct inode * get_pipe_inode(void) |
如果当前pipe不为空(head!=tail),则尝试先将部分数据写入上次使用的buffer,注意这里需要该buffer有PIPE_BUF_FLAG_CAN_MERGE的标志.
1 | /* |
然后是正式的大循环写入
每轮循环:
- 如果!pipe->readers则返回-EPIPE;
- 为本次写入获取一张临时页面(pipe->tmp_page),可能新分配,也可能复用上次失败留下或刚消耗完的.
- 插入到当前buffer->page中并拷贝用户数据.
- 如果pipe满了,直接返回(O_NONBLOCK)或唤醒rd_wait并加入wr_wait等待数据被消耗.
97for (;;) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;
if (!page) {
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
}
pipe->tmp_page = page;
}
/* Allocate a slot in the ring in advance and attach an
* empty buffer. If we fault or otherwise fail to use
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
spin_lock_irq(&pipe->rd_wait.lock);
head = pipe->head;
if (pipe_full(head, pipe->tail, pipe->max_usage)) {
spin_unlock_irq(&pipe->rd_wait.lock);
continue;
}
pipe->head = head + 1;
spin_unlock_irq(&pipe->rd_wait.lock);
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
pipe->tmp_page = NULL;
copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
if (!ret)
ret = -EFAULT;
break;
}
ret += copied;
buf->offset = 0;
buf->len = copied;
if (!iov_iter_count(from))
break;
}
if (!pipe_full(head, pipe->tail, pipe->max_usage))
continue;
/* Wait for buffer space to become available. */
if (filp->f_flags & O_NONBLOCK) {
if (!ret)
ret = -EAGAIN;
break;
}
if (signal_pending(current)) {
if (!ret)
ret = -ERESTARTSYS;
break;
}
/*
* We're going to release the pipe lock and wait for more
* space. We wake up any readers if necessary, and then
* after waiting we need to re-check whether the pipe
* become empty while we dropped the lock.
*/
__pipe_unlock(pipe);
if (was_empty) {
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
__pipe_lock(pipe);
was_empty = pipe_empty(pipe->head, pipe->tail);
wake_next_writer = true;
}
pipe_read
还是大循环的形式.将buf->page拷贝到io向量中后,若该page没有其他引用,将其作为pipe->tmp_page或直接释放.
1 | static ssize_t |
pipe_release
close掉pipe的两端即可释放.
1 | pipe_release |
1 | static int |
splice
splice直接完成管道与文件之间的数据传输,避免内核与用户之间的数据拷贝.
1 | SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, |
__do_splice获取并校验用户参数,管道一端不能设置偏移.
1 | static long __do_splice(struct file *in, loff_t __user *off_in, |
do_splice函数根据两端文件的性质进行dispatch.
1 | /* |
1 | do_splice |
splice_pipe_to_pipe函数
直接看主循环逻辑.分两种情况
- 还需要拷贝的长度大于当前ibuf的长度,则直接将该ibuf给obuf,并将ibuf->ops置NULL,类似于移动语义. 这里没有将ibuf->page置空,直觉上会有问题,但回看一下pipe_write,只会用pipe->tmp_page(新页)填充buf->page后再进行拷贝,所以不会影响到obuf.除非是进行Merge,假设要merge到该ibuf,则该ibuf应该是head-1,又由于i_tail++的操作,此时tail应该是(head-1)+1 ==head.则pipe此时必定是空的,也就不会进行merge操作,排除merge到该buf的可能(说的可能不是很好理解,后面还有一次分析).
- 还需要拷贝的长度小于当前ibuf的长度,先调用pipe_buf_get将ibuf->page引用+1,该页面同时被ibuf和obuf使用.但两者的len,off,flags不同.注意这里需要清除PIPE_BUF_FLAG_CAN_MERGE标志,因为该页在ibuf中还存在可读数据,在outbuf中合并写入会覆盖掉原数据.
53static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
size_t len, unsigned int flags)
{
......
do {
......
ibuf = &ipipe->bufs[i_tail & i_mask];
obuf = &opipe->bufs[o_head & o_mask];
if (len >= ibuf->len) {
/*
* Simply move the whole buffer from ipipe to opipe
*/
*obuf = *ibuf;
ibuf->ops = NULL;
i_tail++;
ipipe->tail = i_tail;
input_wakeup = true;
o_len = obuf->len;
o_head++;
opipe->head = o_head;
} else {
/*
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
if (!pipe_buf_get(ipipe, ibuf)) {
if (ret == 0)
ret = -EFAULT;
break;
}
*obuf = *ibuf;
/*
* Don't inherit the gift and merge flags, we need to
* prevent multiple steals of this page.
*/
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
obuf->len = len;
ibuf->offset += len;
ibuf->len -= len;
o_len = len;
o_head++;
opipe->head = o_head;
}
ret += o_len;
len -= o_len;
} while (len);
......
}
do_splice_from会调用到iter_file_splice_write.以io向量的形式拷贝pipe_buffer数据到文件中,没什么好分析的.
1 | ssize_t |
do_splice_to会调用到copy_page_to_iter_pipe进行实际一页数据的拷贝.
这里使用的方式还是共享页面,将该文件缓存页与obuf共享.
1 | static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, |
共享页面的安全性分析
可以看到splice调用中大量使用共享页面的形式完成数据的”拷贝”.但这种方式在直观上给人不安全的感觉.
详细分析一下三处共享页面.
初始状态,page蓝色部分代表buffer中已有的数据,白色部分表示空闲空间,红色部分表示本次要splice发送的数据.
第一处
1 | static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, |
拷贝完后是这样的.此时ibuf虽然还持有page的指针,但由于buf->ops已经被清空,无法对page进行释放等操作,这一点上是安全的.再来分析两侧对page的读写能力.ibuf端tail已经前移,不能再读取该页,同时head==tail,也不能通过merge操作再次写入该页. 即ibuf端已经完全失去page的访问能力,即使obuf端能通过merge的方式再次写入该页,不会对ibuf端造成任何影响.
当然ibuf的head可能大于tail,此时虽然能进行merge操作但无法merge到已共享的那张page,仍不具有对它的访问能力.
第二处
1 | /* |
还是先从释放等操作分析.这里由于只将ibuf->page中的部分数据发送了,所以ibuf需要继续持有该page.通过pipe_buf_get增加一次对page的引用,所以不会出现其中一端过早释放页面的情况.再来看读写能力,ibuf端可以继续正常读写(写是通过merge)该页.obuf端由于清除了PIPE_BUF_FLAG_CAN_MERGE标志,只具有对该page的读能力.
ibuf端能写,obuf端能读,就有覆盖的风险,然而ibuf和obuf中独立的offset,len字段已经避免了这样的冲突(obuf端只能读红色区域,ibuf端只能写白色区域).
第三处
1 | off = i->iov_offset; |
首先有通过get_page增加页面引用,释放是安全的.
输入侧是file_cache,始终持有对该页面读的能力.
obuf侧可以读,但读受到obuf中offset,len字段的限制,安全.
但由于未清空PIPE_BUF_FLAG_CAN_MERGE位,obuf同时具有对该页面写的能力.
再来看读写的冲突.
file_cache读的范围是整张page,obuf写的范围是蓝色区域,明显存在冲突.对obuf的merge写能覆盖掉文件缓存.
CVE-2022-0847 DirtyPipe
DirtyPipe便是这个问题导致的.利用这个漏洞可以写入只读文件,如写入/etc/passwd或往suid的程序写入shellcode完成提权.
下面是一个简易的exp.
1 |
|