MIT6.828 Lab3

2023-10-03

学习笔记 / 操作系统 / 6.828

字数统计: 2.5k | 阅读时长≈ 11 分钟

Lec 7

一些虚拟内存实现的技巧,如延迟分配,写时复制等
https://pdos.csail.mit.edu/6.828/2018/lec/l-usingvm.pdf

Lab3 User Environments

PartA: User Environments and Exception Handling

Creating and Running Environments

创建并初始化envs,env_setup_vm函数为环境e建立一个专属的页目录表,由于在UTOP上的内核空间映射对每个环境都是相同的,所以可以直接拷贝kern_pgdir过来.

// Mark every entry of 'envs' as free, clear its id, and thread all
// entries onto env_free_list in array order (envs[0] ends up at the
// head, the last entry's link is null). Then run the per-CPU part of
// the initialization.
void
env_init(void)
{
	int idx;

	// Build the list back to front so that the head ends up at envs[0].
	env_free_list = NULL;
	for (idx = NENV - 1; idx >= 0; idx--) {
		envs[idx].env_id = 0;
		envs[idx].env_status = ENV_FREE;
		envs[idx].env_link = env_free_list;
		env_free_list = &envs[idx];
	}

	// Per-CPU part of the initialization
	env_init_percpu();
}

// Give environment 'e' its own page directory.
//
// A fresh page is allocated, filled with a copy of kern_pgdir, and
// the UVPT slot is then pointed back at the new directory itself.
//
// Returns 0 on success, -E_NO_MEM if no page could be allocated.
static int
env_setup_vm(struct Env *e)
{
	struct PageInfo *dir_page = page_alloc(ALLOC_ZERO);

	// Allocate a page for the page directory
	if (dir_page == NULL)
		return -E_NO_MEM;

	// The directory page is now referenced by this environment.
	dir_page->pp_ref++;
	e->env_pgdir = page2kva(dir_page);

	// Start from the kernel's directory.
	memcpy(e->env_pgdir, kern_pgdir, PGSIZE);

	// UVPT maps the env's own page table read-only.
	// Permissions: kernel R, user R
	e->env_pgdir[PDX(UVPT)] = PADDR(e->env_pgdir) | PTE_P | PTE_U;

	return 0;
}

由于还没有文件系统,JOS采用将ELF文件直接链接到内核的方式来提供用户程序,load_icode函数按照binary文件中的ELF头和程序头,将binary映像加载到用户进程(以JOS的概念,进程应该改成环境)空间中,并建立用户页表的相应映射.

实现这一函数主要是熟悉ELF头和程序头各项的意义,其实在MBR加载内核映像的时候我们已经完成过相同的操作.注意页目录表的切换.


// Load the ELF image at 'binary' into the address space of
// environment 'e' and prepare it to run.
//
// For every ELF_PROG_LOAD program header, memory is mapped at p_va
// for p_memsz bytes, the first p_filesz bytes are copied from the
// image, and the remainder is cleared. The entry point is stored in
// e->env_tf.tf_eip and one page is mapped for the initial user stack
// at USTACKTOP - PGSIZE.
//
// Panics if the ELF magic is wrong or a program header has
// p_filesz > p_memsz.
static void
load_icode(struct Env *e, uint8_t *binary)
{
	// LAB 3: Your code here.
	struct Proghdr *ph, *eph;
	struct Elf *elfhdr = (struct Elf *)binary;

	if (elfhdr->e_magic != ELF_MAGIC)
		panic("invalid ELF format,magic is not correct\n");

	ph = (struct Proghdr *)((uint8_t *)elfhdr + elfhdr->e_phoff);
	eph = ph + elfhdr->e_phnum;

	// Switch to the environment's page directory so the segment
	// addresses can be written directly with memcpy/memset.
	lcr3(PADDR(e->env_pgdir));

	for (; ph < eph; ++ph) {
		if (ph->p_filesz > ph->p_memsz)
			panic("file size is great than memmory size\n");
		if (ph->p_type == ELF_PROG_LOAD) {
			// region_alloc takes the environment, not its page
			// directory; p_va is an integer and must be cast
			// before it is used as a pointer.
			region_alloc(e, (void *)ph->p_va, ph->p_memsz);
			memcpy((void *)ph->p_va, binary + ph->p_offset,
			       ph->p_filesz);
			// Clear the part of the segment that has no file
			// contents behind it.
			memset((void *)(ph->p_va + ph->p_filesz), 0,
			       ph->p_memsz - ph->p_filesz);
		}
	}

	e->env_tf.tf_eip = elfhdr->e_entry;

	// Now map one page for the program's initial stack
	// at virtual address USTACKTOP - PGSIZE.
	region_alloc(e, (void *)(USTACKTOP - PGSIZE), PGSIZE);

	// Back to the kernel's page directory.
	lcr3(PADDR(kern_pgdir));
}

//
// Allocate len bytes of physical memory for environment env,
// and map it at virtual address va in the environment's address space.
// 'va' is rounded down and 'va + len' is rounded up to page
// boundaries, so callers may pass unaligned values.
// Pages are requested with ALLOC_ZERO and mapped writable by user
// and kernel.
// Panic if any allocation attempt fails, including a failure
// reported by page_insert.
//
static void
region_alloc(struct Env *e, void *va, size_t len)
{
	// Work in integers so that no arithmetic is done on void pointers.
	uintptr_t addr = ROUNDDOWN((uintptr_t)va, PGSIZE);
	uintptr_t end = ROUNDUP((uintptr_t)va + len, PGSIZE);
	struct PageInfo *pp;
	int r;

	for (; addr < end; addr += PGSIZE) {
		pp = page_alloc(ALLOC_ZERO);
		if (pp == NULL)
			panic("region_alloc:out of memory");

		// page_insert reports failure through its return value;
		// ignoring it would leave this page of the region unmapped.
		r = page_insert(e->env_pgdir, pp, (void *)addr,
				PTE_U | PTE_W | PTE_P);
		if (r < 0)
			panic("region_alloc: page_insert failed: %e", r);
	}
}

之后便可以创建用户环境并运行.

// Allocate a new environment with env_alloc (parent id 0), record
// its type, and load the ELF image 'binary' into it with load_icode.
// Panics if env_alloc reports an error.
void
env_create(uint8_t *binary, enum EnvType type)
{
	struct Env *env = NULL;
	int r;

	if ((r = env_alloc(&env, 0)) < 0)
		panic("env_alloc:%e", r);

	env->env_type = type;
	load_icode(env, binary);
}

// Switch from the current environment (if any) to 'e'.
//
// A previously running environment is put back to ENV_RUNNABLE, 'e'
// becomes curenv in state ENV_RUNNING with its run counter bumped,
// its page directory is loaded, and its saved trapframe is handed to
// env_pop_tf.
void
env_run(struct Env *e)
{
	struct Env *prev = curenv;

	// Only demote the old environment if it was actually running.
	if (prev != NULL && prev->env_status == ENV_RUNNING)
		prev->env_status = ENV_RUNNABLE;

	e->env_status = ENV_RUNNING;
	e->env_runs += 1;
	curenv = e;

	// Switch to the environment's address space before restoring
	// its registers.
	lcr3(PADDR(e->env_pgdir));
	env_pop_tf(&e->env_tf);
}

env相关流程分析

创建与初始化

i386_init函数先调用mem_init,在其中完成envs数组的内存分配,并在kern_pgdir中建立相应映射.
调用env_init,初始化envs数组中的各env,并链入env_free_list等待分配,load新的带有用户段的GDT,设置LDT.

运行准备与运行

env_create函数创建一个针对特定二进制(ELF)文件的运行环境.先调用env_alloc函数分配一个env结构,设置env环境(包括各段选择子,这是切换到用户级权限的关键之一),并为env建立一个单独的页表,映射内核及该页表本身.
调用load_icode函数按ELF头和程序头记录的信息为ELF格式的可执行文件创建内存映像,设置Trapframe->tf_eip为该程序的入口点,为环境映射一个初始栈.
env_run函数设置相关运行信息,调用env_pop_tf函数用env->env_tf设置程序状态,开始以用户级权限运行程序.

Handling Interrupts and Exceptions

trapentry.S中以宏的形式定义了各中断处理例程的入口,以及所有trap共用的例程_alltraps.
该函数按照Trapframe的结构为trap函数压栈准备参数,设置ds和es寄存器.然后调用trap函数且不再返回.

/* TRAPHANDLER defines a globally-visible function for handling a trap.
 * It pushes a trap number onto the stack, then jumps to _alltraps.
 * Use TRAPHANDLER for traps where the CPU automatically pushes an error code.
 *
 * Parameters:
 *   name - symbol defined as the entry point
 *   num  - trap number pushed before jumping to _alltraps
 *
 * You shouldn't call a TRAPHANDLER function from C, but you may
 * need to _declare_ one in C (for instance, to get a function pointer
 * during IDT setup).  You can declare the function with
 *   void NAME();
 * where NAME is the argument passed to TRAPHANDLER.
 */
#define TRAPHANDLER(name, num)						\
	.globl name;		/* define global symbol for 'name' */	\
	.type name, @function;	/* symbol type is function */		\
	.align 2;		/* align function definition */		\
	name:			/* function starts here */		\
	pushl $(num);							\
	jmp _alltraps

/* Use TRAPHANDLER_NOEC for traps where the CPU doesn't push an error code.
 * It pushes a 0 in place of the error code, so the trap frame has the same
 * format in either case.
 *
 * Parameters:
 *   name - symbol defined as the entry point
 *   num  - trap number pushed after the 0 placeholder
 */
#define TRAPHANDLER_NOEC(name, num)					\
	.globl name;							\
	.type name, @function;						\
	.align 2;							\
	name:								\
	pushl $0;							\
	pushl $(num);							\
	jmp _alltraps

.text

// Entry point for T_DIVIDE; the _NOEC variant pushes a 0 in place of
// the error code before the trap number.
TRAPHANDLER_NOEC(divide_handler, T_DIVIDE);
//......
// The remaining handlers are defined the same way and are omitted here.

/*
 * Lab 3: Your code here for _alltraps
 *
 * Common tail of every trap entry point. On entry the stack already
 * holds the trap number (and, above it, the error code or its 0
 * placeholder). This code pushes %ds, %es and the general registers
 * so that the stack matches struct Trapframe, loads the kernel data
 * segment selector into %ds and %es, and calls trap() with a pointer
 * to the frame.
 */
.global _alltraps
_alltraps:
	pushl %ds
	pushl %es
	pushal

	/* AT&T syntax is "mov source, destination": the selector goes
	 * into %ax first and is then copied into the segment registers. */
	movw $GD_KD, %ax
	movw %ax, %ds
	movw %ax, %es

	/* %esp now points at the start of the trap frame; pass it as
	 * the argument of trap(). */
	pushl %esp
	call trap

下面是Trapframe结构的定义,由注释分为三个部分,
最下面的一部分仅在发生特权级切换的时候才压入.
下图中的Trapframe部分在发生异常时由处理器硬件压入.

/* Saved general-purpose registers; embedded as the first member of
 * struct Trapframe. */
struct PushRegs {
	/* registers as pushed by pusha */
	uint32_t reg_edi;
	uint32_t reg_esi;
	uint32_t reg_ebp;
	uint32_t reg_oesp;		/* Useless */
	uint32_t reg_ebx;
	uint32_t reg_edx;
	uint32_t reg_ecx;
	uint32_t reg_eax;
} __attribute__((packed));

/* Saved processor state for one trap.
 *
 * The members up to and including tf_trapno are stored by software;
 * the members from tf_err on are marked as defined by x86 hardware,
 * and the final group (tf_esp, tf_ss) is only present when the trap
 * crossed privilege rings. The 16-bit tf_padding* members fill each
 * segment selector out to 32 bits.
 */
struct Trapframe {
	struct PushRegs tf_regs;
	uint16_t tf_es;
	uint16_t tf_padding1;
	uint16_t tf_ds;
	uint16_t tf_padding2;
	uint32_t tf_trapno;
	/* below here defined by x86 hardware */
	uint32_t tf_err;
	uintptr_t tf_eip;
	uint16_t tf_cs;
	uint16_t tf_padding3;
	uint32_t tf_eflags;
	/* below here only when crossing rings, such as from user to kernel */
	uintptr_t tf_esp;
	uint16_t tf_ss;
	uint16_t tf_padding4;
} __attribute__((packed));

trap流程分析

i386_init中调用trap_init完成trap的初始化:将各中断处理例程(在本实现中是中断处理例程的入口点)挂接到IDT中,设置TSS段(存储了内核栈的地址信息)并挂接到GDT中,设置IDT和TSS.

当中断或异常发生,处理器根据中断向量号在IDT中寻找对应的中断处理例程,在进行特权级检查(中断处理例程DPL≤CPL≤中断门描述符DPL)后,若发生特权级转换,CPU从当前TSS段中取出内核栈的地址信息(ss,esp)并加载到ss,esp寄存器中(加载前临时保存原值).处理器在内核栈中依次压入原栈地址(ss,esp),eflags,cs和eip;对于带错误码的异常,处理器还会自动压入错误码,然后跳转到中断处理例程.接下来由操作系统的入口代码补齐其余内容:对处理器不压入错误码的异常压入0占位,再压入中断号及其他寄存器信息,形成Trapframe结构,最后跳转到trap函数.

// C-level trap handler.
//
// 'tf' points at the trapframe describing the trap. When the trap
// came from user mode (low two bits of tf_cs equal 3), the frame is
// copied into curenv->env_tf and that copy is used from then on. The
// trap is then handed to trap_dispatch, after which the current
// environment is resumed through env_run.
void
trap(struct Trapframe *tf)
{
	// The environment may have set DF and some versions
	// of GCC rely on DF being clear
	asm volatile("cld" ::: "cc");

	// Check that interrupts are disabled.  If this assertion
	// fails, DO NOT be tempted to fix it by inserting a "cli" in
	// the interrupt path.
	assert(!(read_eflags() & FL_IF));

	cprintf("Incoming TRAP frame at %p\n", tf);

	if ((tf->tf_cs & 3) == 3) {
		// Trapped from user mode.
		assert(curenv);

		// Copy trap frame (which is currently on the stack)
		// into 'curenv->env_tf', so that running the environment
		// will restart at the trap point.
		curenv->env_tf = *tf;
		// The trapframe on the stack should be ignored from here on.
		tf = &curenv->env_tf;
	}

	// Record that tf is the last real trapframe so
	// print_trapframe can print some additional information.
	last_tf = tf;

	// Dispatch based on what type of trap occurred
	trap_dispatch(tf);

	// Return to the current environment, which should be running.
	assert(curenv && curenv->env_status == ENV_RUNNING);
	env_run(curenv);
}

trap函数进一步根据中断向量号分发(trap_dispatch)到各真正的中断处理例程(本实现中IDT中保存的只是一个中断处理例程的entry).
完成异常处理后,调用env_run函数恢复原环境(进程).

Part B: Page Faults, Breakpoints Exceptions, and System Calls

The Breakpoint Exception

调试器执行的原理:临时将断点处的指令替换为1字节的int3软中断指令,执行到该处时触发Breakpoint Exception.在这里的实现中会调用内核监视器monitor,并添加两个命令nextstep和continue:前者会设置eflags的FL_TF位使处理器开始单步执行,每执行完一条指令就触发一个中断号为1的debug exception,在本实现中该异常的处理例程同样会启动monitor.

The breakpoint exception, interrupt vector 3 (T_BRKPT), is normally used to allow debuggers to insert breakpoints in a program’s code by temporarily replacing the relevant program instruction with the special 1-byte int3 software interrupt instruction.

// Monitor command: single-step the trapped code.
//
// Prints the saved instruction pointer. When invoked from a
// breakpoint trap the FL_TF flag is set in the saved eflags; when
// invoked from a debug trap that already has FL_TF set nothing is
// changed. Both of those cases return -1. Any other situation prints
// a usage message and returns 0.
// Panics if 'tf' is null.
int
mon_nextstep(int argc, char **argv, struct Trapframe *tf)
{
	if (tf == NULL)
		panic("empty Trapframe");

	cprintf("$rip: %p\n",tf->tf_eip);

	// First step after a breakpoint: turn single-stepping on.
	if (tf->tf_trapno == T_BRKPT) {
		tf->tf_eflags |= FL_TF;
		return -1;
	}

	// Already single-stepping: just keep going.
	if (tf->tf_trapno == T_DEBUG && (tf->tf_eflags & FL_TF) != 0)
		return -1;

	cprintf("nextstep(ni) can only called via int 3(breakpoint exception)\n");
	return 0;
}

// Monitor command: resume the trapped code at full speed.
//
// Valid only when the monitor was entered from a breakpoint or debug
// trap. In that case FL_TF is cleared in the saved eflags (a no-op
// if it was not set, e.g. after a plain int3 that was never
// single-stepped) and -1 is returned.
// NOTE(review): -1 is presumably what makes the monitor loop exit
// and resume the environment; confirm against the monitor's command
// loop.
// For any other trap a message is printed and 0 is returned.
// Panics if 'tf' is null.
int
mon_continue(int argc, char **argv, struct Trapframe *tf)
{
	if (!tf)
		panic("empty Trapframe");

	if (tf->tf_trapno != T_DEBUG && tf->tf_trapno != T_BRKPT) {
		cprintf("continue can only called via breakpoint or debug exception!\n");
		return 0;
	}

	// Stop single-stepping. This must not depend on FL_TF being
	// set, otherwise "continue" straight after a breakpoint is
	// rejected.
	tf->tf_eflags &= ~FL_TF;
	return -1;
}

System calls 系统调用

注意权限位的检验采用(perm&(*pte))==perm的形式,想象一下交集便于理解这一操作.(一些类型转换使程序看起来很乱,ye..i know…)

int
user_mem_check(struct Env *env, const void *va, size_t len, int perm)
{
	// LAB 3: Your code here.
		uintptr_t start_va = ROUNDDOWN((uintptr_t)va,PGSIZE);
		uintptr_t end_va = ROUNDUP((uintptr_t)(va+len),PGSIZE);
		pte_t* pte;
		for(;start_va<end_va;start_va+=PGSIZE)
		{
			pte = pgdir_walk(env->env_pgdir,(void*)start_va,false);
			if(start_va>=ULIM||pte==NULL||!(*pte&PTE_P)||((perm&(*pte))!=perm))
			{
				user_mem_check_addr = start_va>(uintptr_t)va?start_va:(uintptr_t)va;
				return -E_FAULT;
			}
		}
	return 0;
}

syscall流程分析

用户进程调用sys_xxx函数(其实用户并不直接调用这样的函数,而是由更上一层的函数如cprintf之类的调用.~~这里的用户指的是使用JOS而不是开发JOS的开发者hh~~),向操作系统申请xxx的操作,sys_xxx函数调用syscall函数,syscall函数使用int 0x30触发中断.
(上述sys_xxx,syscall函数均为lib/syscall.c中定义的由用户进程使用的函数,而非kern/syscall.c中的内核使用的实现)

中断处理过程参考上面的trap流程分析.
最终被分发到内核的syscall函数,该函数通过调用相应的内核函数完成功能,并设置系统调用返回值,之后正常从中断返回到用户进程.

版权声明： 本博客所有文章除特别声明外，著作权归作者所有。转载请注明出处！