The QemuTimer Callback Mechanism

Preface

While working on FastCP from HWS 2021, I hit the following scenario: the vulnerability is an out-of-bounds read/write on CP_buffer, which can overwrite the QEMUTimer structure sitting right behind it, so the natural idea is to hijack its callback function pointer.

But then I thought one step further: cp_timer's callback is registered (initialized) by the timer_init_full call in pci_FastCP_realize. What if the callback pointer had already been saved somewhere else at registration time, and that saved copy is what later gets invoked, so the pointer inside this structure would never be used at all?

struct FastCPState
{
    PCIDevice_0 pdev;
    MemoryRegion_0 mmio;
    CP_state cp_state;
    uint8_t handling;
    uint32_t irq_status;
    char CP_buffer[4096];
    QEMUTimer_0 cp_timer;
};


struct QEMUTimer
{
    int64_t expire_time;
    QEMUTimerList_0 *timer_list;
    QEMUTimerCB *cb;
    void *opaque;
    QEMUTimer_0 *next;
    int attributes;
    int scale;
};
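A quick sanity check on the layout: anything written past the 4096-byte CP_buffer spills directly into cp_timer, so cb and opaque sit at fixed small offsets past the buffer. A minimal sketch (my code, not FastCP's; it mirrors only the relevant tail of the struct and assumes the usual LP64 layout with no unusual padding):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef void QEMUTimerCB(void *opaque);

typedef struct QEMUTimer {
    int64_t expire_time;
    void *timer_list;
    QEMUTimerCB *cb;
    void *opaque;
    struct QEMUTimer *next;
    int attributes;
    int scale;
} QEMUTimer;

/* Only the tail of FastCPState matters for the overflow. */
typedef struct {
    char CP_buffer[4096];
    QEMUTimer cp_timer;
} fastcp_tail;

int main(void)
{
    /* Bytes written past CP_buffer land on cp_timer field by field. */
    printf("expire_time at CP_buffer + %zu\n",
           offsetof(fastcp_tail, cp_timer.expire_time) -
           offsetof(fastcp_tail, CP_buffer));
    printf("cb          at CP_buffer + %zu\n",
           offsetof(fastcp_tail, cp_timer.cb) -
           offsetof(fastcp_tail, CP_buffer));
    printf("opaque      at CP_buffer + %zu\n",
           offsetof(fastcp_tail, cp_timer.opaque) -
           offsetof(fastcp_tail, CP_buffer));
    return 0;
}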

A breakpoint and a quick debugging session would have settled it, but when I looked at the implementation of timer_init_full, I was stunned.

void timer_init_full(QEMUTimer *ts,
                     QEMUTimerListGroup *timer_list_group, QEMUClockType type,
                     int scale, int attributes,
                     QEMUTimerCB *cb, void *opaque)
{
    if (!timer_list_group) {
        timer_list_group = &main_loop_tlg;
    }
    ts->timer_list = timer_list_group->tl[type];
    ts->cb = cb;
    ts->opaque = opaque;
    ts->scale = scale;
    ts->attributes = attributes;
    ts->expire_time = -1;
}

I had imagined a registration function would surely link ts into the timer_list, but this one merely points ts->timer_list at the corresponding list, leaving ts itself outside of it. So when the expiry time arrives, how is ts ever found? By scanning memory for timer_list pointers? Obviously not. Time to read the QEMU source.
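For reference while reading on, these are the timer-list structures involved (abridged from util/qemu-timer.c and include/qemu/timer.h; field order may drift across QEMU versions):

/* util/qemu-timer.c */
struct QEMUTimerList {
    QEMUClock *clock;
    QemuMutex active_timers_lock;
    QEMUTimer *active_timers;       /* sorted, singly-linked list */
    QLIST_ENTRY(QEMUTimerList) list;
    QEMUTimerListNotifyCB *notify_cb;
    void *notify_opaque;
    QemuEvent timers_done_ev;
};

/* include/qemu/timer.h */
struct QEMUTimerListGroup {
    QEMUTimerList *tl[QEMU_CLOCK_MAX];
};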

Source Code Analysis

It turned out to be nothing mysterious: the timer gets linked into the list inside timer_mod. Reading source code from one arbitrary entry point always produces an endless stream of "why is this called here? what does this condition mean?". To answer the whole chain of questions, I traced the QemuTimer callback mechanism through the source from end to end.

/* util/qemu-timer.c */

void timer_mod(QEMUTimer *ts, int64_t expire_time)
{
    timer_mod_ns(ts, expire_time * ts->scale);
}

/* modify the current timer so that it will be fired when current_time
   >= expire_time. The corresponding callback will be called. */
void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
{
    QEMUTimerList *timer_list = ts->timer_list;
    bool rearm;

    qemu_mutex_lock(&timer_list->active_timers_lock);
    /* Remove ts from timer_list first (when was it ever added? --
       presumably this guards against inserting it twice). */
    timer_del_locked(timer_list, ts);
    /* Insert ts into the active_timers list according to expire_time;
       active_timers is a sorted list, kept in ascending expire_time
       order. The return value rearm says whether ts ended up at the
       head, i.e. whether the nearest deadline of timer_list changed. */
    rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
    qemu_mutex_unlock(&timer_list->active_timers_lock);

    /* If the nearest deadline of timer_list changed, call
       timerlist_rearm; the reason is explained later. */
    if (rearm) {
        timerlist_rearm(timer_list);
    }
}

static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts)
{
    QEMUTimer **pt, *t;

    ts->expire_time = -1;
    pt = &timer_list->active_timers;
    for (;;) {
        t = *pt;
        if (!t)
            break;
        if (t == ts) {
            qatomic_set(pt, t->next);
            break;
        }
        pt = &t->next;
    }
}

static bool timer_mod_ns_locked(QEMUTimerList *timer_list,
                                QEMUTimer *ts, int64_t expire_time)
{
    QEMUTimer **pt, *t;

    /* add the timer in the sorted list */
    pt = &timer_list->active_timers;
    for (;;) {
        t = *pt;
        if (!timer_expired_ns(t, expire_time)) {
            break;
        }
        pt = &t->next;
    }
    ts->expire_time = MAX(expire_time, 0);
    ts->next = *pt;
    qatomic_set(pt, ts);

    return pt == &timer_list->active_timers;
}

static void timerlist_rearm(QEMUTimerList *timer_list)
{
    /* Interrupt execution to force deadline recalculation. */
    if (icount_enabled() && timer_list->clock->type == QEMU_CLOCK_VIRTUAL) {
        icount_start_warp_timer();
    }
    timerlist_notify(timer_list);
}

timerlist_rearm finishes with timerlist_notify, which ultimately leads to qemu_notify_event.
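For completeness, here is timerlist_notify (also util/qemu-timer.c): it dispatches to the list's notify_cb if one was registered, and falls back to qemu_notify_event otherwise. For the main-loop timer lists, notify_cb is qemu_timer_notify_cb, passed in via init_clocks() during main-loop setup:

static void timerlist_notify(QEMUTimerList *timer_list)
{
    if (timer_list->notify_cb) {
        timer_list->notify_cb(timer_list->notify_opaque, timer_list->clock->type);
    } else {
        qemu_notify_event();
    }
}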

/* softmmu/cpu-timers.c */

void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
{
    if (!icount_enabled() || type != QEMU_CLOCK_VIRTUAL) {
        qemu_notify_event();
        return;
    }

    if (qemu_in_vcpu_thread()) {
        /*
         * A CPU is currently running; kick it back out to the
         * tcg_cpu_exec() loop so it will recalculate its
         * icount deadline immediately.
         */
        qemu_cpu_kick(current_cpu);
    } else if (first_cpu) {
        /*
         * qemu_cpu_kick is not enough to kick a halted CPU out of
         * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
         * causes cpu_thread_is_idle to return false. This way,
         * handle_icount_deadline can run.
         * If we have no CPUs at all for some reason, we don't
         * need to do anything.
         */
        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
    }
}

As the comment on qemu_notify_event explains, it forces main_loop_wait to look at pending events and exit, which triggers one pass of qemu_clock_run_all_timers.

/**
 * qemu_notify_event: Force processing of pending events.
 *
 * Similar to signaling a condition variable, qemu_notify_event forces
 * main_loop_wait to look at pending events and exit. The caller of
 * main_loop_wait will usually call it again very soon, so qemu_notify_event
 * also has the side effect of recalculating the sets of file descriptors
 * that the main loop waits for.
 *
 * Calling qemu_notify_event is rarely necessary, because main loop
 * services (bottom halves and timers) call it themselves.
 */
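The implementation itself is tiny (util/main-loop.c): it pokes the main AioContext's event notifier, whose file descriptor is part of the set that ppoll blocks on, so the poll wakes up immediately:

void qemu_notify_event(void)
{
    if (!qemu_aio_context) {
        return;
    }
    aio_notify(qemu_aio_context);
}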

The call chain that finally runs the callback:

main_loop_wait ->
    timerlistgroup_deadline_ns ->
        qemu_soonest_timeout
    os_host_main_loop_wait ->
        qemu_poll_ns ->
            ppoll
    qemu_clock_run_all_timers ->
        timerlist_run_timers ->
            cb
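The last two hops of that chain are a simple fan-out over the clock types (util/qemu-timer.c, abridged from the tree this post is based on):

bool qemu_clock_run_timers(QEMUClockType type)
{
    return timerlist_run_timers(main_loop_tlg.tl[type]);
}

bool qemu_clock_run_all_timers(void)
{
    bool progress = false;
    QEMUClockType type;

    for (type = 0; type < QEMU_CLOCK_MAX; type++) {
        if (qemu_clock_use_for_deadline(qemu_clock_ptr(type))) {
            progress |= qemu_clock_run_timers(type);
        }
    }

    return progress;
}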

Let's look at main_loop_wait first. The main routine calls it in a loop; it uses poll (ppoll) to wait for file-descriptor events and dispatches the results to the corresponding notifiers.

/* util/main-loop.c */
void main_loop_wait(int nonblocking)
{
    /* Set up the poll parameters; timeout = UINT32_MAX means
       block indefinitely. */
    MainLoopPoll mlpoll = {
        .state = MAIN_LOOP_POLL_FILL,
        .timeout = UINT32_MAX,
        .pollfds = gpollfds,
    };
    int ret;
    int64_t timeout_ns;

    if (nonblocking) {
        mlpoll.timeout = 0;
    }

    /* poll any events */
    g_array_set_size(gpollfds, 0); /* reset for new iteration */
    /* XXX: separate device handlers from system ones */

    /* Invoke the notify function of every notifier on
       main_loop_poll_notifiers. */
    notifier_list_notify(&main_loop_poll_notifiers, &mlpoll);

    if (mlpoll.timeout == UINT32_MAX) {
        /* Effectively converts the 32-bit -1 into a 64-bit -1. */
        timeout_ns = -1;
    } else {
        timeout_ns = (uint64_t)mlpoll.timeout * (int64_t)(SCALE_MS);
    }

    /* Compute the nearest deadline in the timer list group; if it is
       sooner than the current timeout_ns, use the deadline instead. */
    timeout_ns = qemu_soonest_timeout(timeout_ns,
                                      timerlistgroup_deadline_ns(
                                          &main_loop_tlg));

    /* Block in poll (ppoll) with timeout = timeout_ns. */
    ret = os_host_main_loop_wait(timeout_ns);

    mlpoll.state = ret < 0 ? MAIN_LOOP_POLL_ERR : MAIN_LOOP_POLL_OK;

    /* Notify every notifier on main_loop_poll_notifiers again so they
       can inspect the result of this poll. */
    notifier_list_notify(&main_loop_poll_notifiers, &mlpoll);

    if (icount_enabled()) {
        /*
         * CPU thread can infinitely wait for event after
         * missing the warp
         */
        icount_start_warp_timer();
    }

    /* Some timers may have expired; run their callbacks. */
    qemu_clock_run_all_timers();
}
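A small aside on the -1 handling above: qemu_soonest_timeout (include/qemu/timer.h) picks the earlier of two timeouts while treating -1 as infinity, via an unsigned compare:

static inline int64_t qemu_soonest_timeout(int64_t timeout1, int64_t timeout2)
{
    /* we can abuse the fact that -1 (which means infinite) is a maximal
     * value when cast to unsigned. As this is disgusting, it's kept in
     * one inline function.
     */
    return ((uint64_t) timeout1 < (uint64_t) timeout2) ? timeout1 : timeout2;
}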

So main_loop_wait computes the nearest deadline in main_loop_tlg each iteration and blocks on it, and after every block it calls qemu_clock_run_all_timers to run any callbacks that are due. (If the block ended because of the timeout, timer callbacks are due; if it ended because of some other event, qemu_clock_run_all_timers still runs but simply finds nothing to do.)

There is one subtle case, though. The maximum blocking time passed to os_host_main_loop_wait is the deadline computed before entering the block, so when the block times out, the deadline has just arrived and the timer callbacks run right on schedule. But if the nearest deadline moves earlier while we are blocked, continuing to sleep until the old deadline would let the new deadline pass without its callback running, which would be wrong.

That is exactly why timer_mod_ns calls timerlist_rearm (ultimately qemu_notify_event) whenever the earliest deadline moves up: it makes os_host_main_loop_wait return immediately so the deadline can be recomputed.

It also explains the seemingly pointless unconditional call to qemu_clock_run_all_timers at the end of main_loop_wait.


This finally confirms it: the callback stored in the QEMUTimer does get called, and its argument is fully controllable.

/* util/qemu-timer.c */

bool timerlist_run_timers(QEMUTimerList *timer_list)
{
    QEMUTimer *ts;
    int64_t current_time;
    bool progress = false;
    QEMUTimerCB *cb;
    void *opaque;

    if (!qatomic_read(&timer_list->active_timers)) {
        return false;
    }

    qemu_event_reset(&timer_list->timers_done_ev);
    if (!timer_list->clock->enabled) {
        goto out;
    }

    switch (timer_list->clock->type) {
    case QEMU_CLOCK_REALTIME:
        break;
    default:
    case QEMU_CLOCK_VIRTUAL:
        break;
    case QEMU_CLOCK_HOST:
        if (!replay_checkpoint(CHECKPOINT_CLOCK_HOST)) {
            goto out;
        }
        break;
    case QEMU_CLOCK_VIRTUAL_RT:
        if (!replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL_RT)) {
            goto out;
        }
        break;
    }

    /*
     * Extract expired timers from active timers list and process them.
     *
     * In rr mode we need "filtered" checkpointing for virtual clock. The
     * checkpoint must be recorded/replayed before processing any non-EXTERNAL timer,
     * and that must only be done once since the clock value stays the same. Because
     * non-EXTERNAL timers may appear in the timers list while it being processed,
     * the checkpoint can be issued at a time until no timers are left and we are
     * done".
     */
    current_time = qemu_clock_get_ns(timer_list->clock->type);
    qemu_mutex_lock(&timer_list->active_timers_lock);
    while ((ts = timer_list->active_timers)) {
        if (!timer_expired_ns(ts, current_time)) {
            /* No expired timers left. The checkpoint can be skipped
             * if no timers fired or they were all external.
             */
            break;
        }
        /* Checkpoint for virtual clock is redundant in cases where
         * it's being triggered with only non-EXTERNAL timers, because
         * these timers don't change guest state directly.
         */
        if (replay_mode != REPLAY_MODE_NONE
            && timer_list->clock->type == QEMU_CLOCK_VIRTUAL
            && !(ts->attributes & QEMU_TIMER_ATTR_EXTERNAL)
            && !replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL)) {
            qemu_mutex_unlock(&timer_list->active_timers_lock);
            goto out;
        }

        /* remove timer from the list before calling the callback */
        timer_list->active_timers = ts->next;
        ts->next = NULL;
        ts->expire_time = -1;
        cb = ts->cb;
        opaque = ts->opaque;

        /* run the callback (the timer list can be modified) */
        qemu_mutex_unlock(&timer_list->active_timers_lock);
        cb(opaque);
        qemu_mutex_lock(&timer_list->active_timers_lock);

        progress = true;
    }
    qemu_mutex_unlock(&timer_list->active_timers_lock);

out:
    qemu_event_set(&timer_list->timers_done_ev);
    return progress;
}
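To make the attack surface concrete, here is a self-contained toy model (my code, not QEMU's) of the dispatch at the heart of the loop above: whatever sits in cb is called with opaque, with no validation of either.

#include <stdio.h>

typedef void TimerCB(void *opaque);

struct toy_timer {
    long long expire_time;
    TimerCB *cb;
    void *opaque;
    struct toy_timer *next;
};

static void run_expired(struct toy_timer **head, long long now)
{
    struct toy_timer *ts;
    /* Mirrors timerlist_run_timers: pop expired timers off the sorted
     * list head and call whatever cb/opaque they carry. */
    while ((ts = *head) != NULL && ts->expire_time <= now) {
        *head = ts->next;   /* unlink before the call */
        ts->cb(ts->opaque); /* attacker-controlled if the timer was overwritten */
    }
}

static void greet(void *opaque)
{
    printf("cb(\"%s\") was called\n", (const char *)opaque);
}

int main(void)
{
    struct toy_timer t = {
        .expire_time = 0,
        .cb = greet,
        .opaque = "controlled argument",
        .next = NULL,
    };
    struct toy_timer *head = &t;

    run_expired(&head, 100); /* 100 >= 0, so the timer fires */
    return 0;
}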

Two Interesting Things

/* include/sysemu/cpu-timers.h */

/* icount - Instruction Counter API */

/*
 * icount enablement state:
 *
 * 0 = Disabled - Do not count executed instructions.
 * 1 = Enabled - Fixed conversion of insn to ns via "shift" option
 * 2 = Enabled - Runtime adaptive algorithm to compute shift
 */
#ifdef CONFIG_TCG
extern int use_icount;
#define icount_enabled() (use_icount)
#else
#define icount_enabled() 0
#endif

/**
 * WITH_QEMU_LOCK_GUARD - Lock a lock object for scope
 *
 * @x: a lock object (currently one of QemuMutex, CoMutex, QemuSpin).
 *
 * This macro defines a lock scope such that entering the scope takes the lock
 * and leaving the scope releases the lock. Return statements are allowed
 * within the scope and release the lock. Break and continue statements leave
 * the scope early and release the lock.
 *
 *   WITH_QEMU_LOCK_GUARD(&mutex) {
 *       ...
 *       if (error) {
 *           return;   <-- mutex is automatically unlocked
 *       }
 *
 *       if (early_exit) {
 *           break;    <-- leave this scope early
 *       }
 *       ...
 *   }
 */
#define WITH_QEMU_LOCK_GUARD(x) \
    WITH_QEMU_LOCK_GUARD_((x), glue(qemu_lockable_auto, __COUNTER__))
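The magic is a for loop that runs its body exactly once: g_autoptr attaches a cleanup function that unlocks on any scope exit (including return), while the loop's third clause unlocks and clears var on break or normal fall-through. Abridged from include/qemu/lockable.h; check your tree for the exact form:

#define WITH_QEMU_LOCK_GUARD_(x, var)                                      \
    for (g_autoptr(QemuLockable) var =                                     \
                 qemu_lockable_auto_lock(QEMU_MAKE_LOCKABLE_NONNULL((x))); \
         var;                                                              \
         qemu_lockable_auto_unlock(var), var = NULL)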