Linux Kernel Page Table Mapping Setup -- ARM64 v5.0

This analysis is based on Linux v5.0.

Note: where pgd and pud appear in parameter or variable names they refer to table entries. To keep the distinction clear, this analysis writes tables in uppercase and entries in lowercase (e.g. PGD is the page global directory table; pgd is an entry within it).

Questions

  1. How does the kernel cope with the different memory-access regimes before and after the MMU is enabled?
  2. Once the MMU is on, what does the virtual-to-physical address mapping look like?

ARM64 page table mapping setup

head.S

Analysis of the relevant macros

create_table_entry

Creates one entry in tbl pointing at the physical page immediately following tbl.

In effect this creates, within tbl, the next-level page table for the virtual address virt; that next-level table lives in the page right after the current tbl.

/*
 * Macro to create a table entry to the next page.
 *
 *	tbl:	page table address
 *	virt:	virtual address
 *	shift:	#imm page table shift
 *	ptrs:	#imm pointers per table page
 *
 * Preserves:	virt
 * Corrupts:	ptrs, tmp1, tmp2
 * Returns:	tbl -> next level table page address
 */
	.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
	add	\tmp1, \tbl, #PAGE_SIZE
	phys_to_pte \tmp2, \tmp1
	orr	\tmp2, \tmp2, #PMD_TYPE_TABLE	// address of next table and entry type
	lsr	\tmp1, \virt, #\shift
	sub	\ptrs, \ptrs, #1
	and	\tmp1, \tmp1, \ptrs		// table index
	str	\tmp2, [\tbl, \tmp1, lsl #3]
	add	\tbl, \tbl, #PAGE_SIZE		// next level table page
	.endm
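
To make the pointer arithmetic concrete, here is a minimal C sketch of the same logic (hypothetical code, not from the kernel; phys_to_pte is reduced to an identity transform, and since the MMU is still off, plain pointers stand in for physical addresses):

typedef unsigned long long u64;

#define PAGE_SIZE	4096ULL
#define PMD_TYPE_TABLE	3ULL	/* table descriptor: bits[1:0] == 0b11 */

/* Hypothetical C rendering of create_table_entry. */
static u64 *create_table_entry(u64 *tbl, u64 virt, unsigned shift, u64 ptrs)
{
	u64 next  = (u64)tbl + PAGE_SIZE;		/* next-level table follows tbl */
	u64 index = (virt >> shift) & (ptrs - 1);	/* virt's slot in this table */

	tbl[index] = next | PMD_TYPE_TABLE;		/* link the entry to the next table */
	return (u64 *)next;				/* caller descends one level */
}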

populate_entries

Fills the entries of tbl from index index to eindex with the contiguous physical memory starting at rtbl; the physical address written into each successive entry is incremented by inc.

That physical memory may hold next-level page tables, or ordinary pages (when tbl is a PT).

/*
 * Macro to populate page table entries, these entries can be pointers to the next level
 * or last level entries pointing to physical memory.
 *
 *	tbl:	page table address
 *	rtbl:	pointer to page table or physical memory
 *	index:	start index to write
 *	eindex:	end index to write - [index, eindex] written to
 *	flags:	flags for pagetable entry to or in
 *	inc:	increment to rtbl between each entry
 *	tmp1:	temporary variable
 *
 * Preserves:	tbl, eindex, flags, inc
 * Corrupts:	index, tmp1
 * Returns:	rtbl
 */
	.macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1
.Lpe\@:	phys_to_pte \tmp1, \rtbl
	orr	\tmp1, \tmp1, \flags	// tmp1 = table entry
	str	\tmp1, [\tbl, \index, lsl #3]
	add	\rtbl, \rtbl, \inc	// rtbl = pa next level
	add	\index, \index, #1
	cmp	\index, \eindex
	b.ls	.Lpe\@
	.endm
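
The loop reads naturally as C (again a hypothetical sketch, reusing the simplifications above):

/* Hypothetical C rendering of populate_entries. */
static u64 populate_entries(u64 *tbl, u64 rtbl, u64 index, u64 eindex,
			    u64 flags, u64 inc)
{
	for (; index <= eindex; index++) {	/* b.ls: loop while index <= eindex */
		tbl[index] = rtbl | flags;	/* descriptor = output address | flags */
		rtbl += inc;			/* next table page, or next block */
	}
	return rtbl;
}
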
compute_indices

Given a virtual address range [vstart, vend] and the number count of extra entries that were required at the previous level, this computes the start and end indices istart and iend in the current-level table, along with the number count of extra tables required at the next level.

Written out as pseudocode it is quite self-explanatory (note that the kernel's in-line comment reads iend += (count - 1) * ptrs, while the mul instruction actually scales by count):

iend = (vend >> shift) & (ptrs - 1)
iend += count * ptrs
istart = (vstart >> shift) & (ptrs - 1)
count = iend - istart
/*
 * Compute indices of table entries from virtual address range. If multiple entries
 * were needed in the previous page table level then the next page table level is assumed
 * to be composed of multiple pages. (This effectively scales the end index).
 *
 *	vstart:	virtual address of start of range
 *	vend:	virtual address of end of range
 *	shift:	shift used to transform virtual address into index
 *	ptrs:	number of entries in page table
 *	istart:	index in table corresponding to vstart
 *	iend:	index in table corresponding to vend
 *	count:	On entry: how many extra entries were required in previous level, scales
 *		our end index.
 *		On exit: returns how many extra entries required for next page table level
 *
 * Preserves:	vstart, vend, shift, ptrs
 * Returns:	istart, iend, count
 */
	.macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
	lsr	\iend, \vend, \shift
	mov	\istart, \ptrs
	sub	\istart, \istart, #1
	and	\iend, \iend, \istart	// iend = (vend >> shift) & (ptrs - 1)
	mov	\istart, \ptrs
	mul	\istart, \istart, \count
	add	\iend, \iend, \istart	// iend += (count - 1) * ptrs
					// our entries span multiple tables

	lsr	\istart, \vstart, \shift
	mov	\count, \ptrs
	sub	\count, \count, #1
	and	\istart, \istart, \count

	sub	\count, \iend, \istart
	.endm
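
A hypothetical C equivalent. The scaling term is what makes the multi-page trick work: if the previous level needed count extra entries, this level consists of count + 1 consecutive table pages, so the end index is pushed count whole tables to the right:

/* Hypothetical C rendering of compute_indices. */
static void compute_indices(u64 vstart, u64 vend, unsigned shift, u64 ptrs,
			    u64 *istart, u64 *iend, u64 *count)
{
	*iend   = ((vend >> shift) & (ptrs - 1)) + *count * ptrs;
	*istart = (vstart >> shift) & (ptrs - 1);
	*count  = *iend - *istart;	/* extra tables needed at the next level */
}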

map_memory

Maps the virtual region [vstart, vend] onto physical memory starting at phys. The page tables are built top-down, with each level's tables occupying adjacent physical pages.


/*
 * Map memory for specified virtual address range. Each level of page table needed supports
 * multiple entries. If a level requires n entries the next page table level is assumed to be
 * formed from n pages.
 *
 *	tbl:	location of page table
 *	rtbl:	address to be used for first level page table entry (typically tbl + PAGE_SIZE)
 *	vstart:	start address to map
 *	vend:	end address to map - we map [vstart, vend]
 *	flags:	flags to use to map last level entries
 *	phys:	physical address corresponding to vstart - physical memory is contiguous
 *	pgds:	the number of pgd entries
 *
 * Temporaries:	istart, iend, tmp, count, sv - these need to be different registers
 * Preserves:	vstart, vend, flags
 * Corrupts:	tbl, rtbl, istart, iend, tmp, count, sv
 */
	.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
	add \rtbl, \tbl, #PAGE_SIZE
	mov \sv, \rtbl
	mov \count, #0
	compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
	mov \tbl, \sv
	mov \sv, \rtbl

#if SWAPPER_PGTABLE_LEVELS > 3
	compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
	mov \tbl, \sv
	mov \sv, \rtbl
#endif

#if SWAPPER_PGTABLE_LEVELS > 2
	compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
	mov \tbl, \sv
#endif

	compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
	bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
	populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
	.endm
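
Putting the three macros together, here is a hypothetical C walkthrough of map_memory for one common configuration (4 KB pages, VA_BITS = 39, so SWAPPER_PGTABLE_LEVELS == 2: a PGD level whose entries point at PMD pages, and a PMD level mapping 2 MB blocks; both #if branches drop out). It reuses the helpers sketched above:

#define PGDIR_SHIFT		30	/* 1 GB per PGD entry (4K pages, 39-bit VA) */
#define SWAPPER_BLOCK_SHIFT	21	/* 2 MB section maps */
#define SWAPPER_BLOCK_SIZE	(1ULL << SWAPPER_BLOCK_SHIFT)
#define PTRS_PER_PTE		512ULL

/* Hypothetical sketch, not kernel code. */
static void map_memory(u64 *tbl, u64 vstart, u64 vend, u64 flags,
		       u64 phys, u64 pgds)
{
	u64 istart, iend, count = 0;
	u64 rtbl = (u64)tbl + PAGE_SIZE;	/* level-1 tables start one page up */
	u64 *sv;

	/* PGD level: entries point at consecutively allocated PMD pages. */
	compute_indices(vstart, vend, PGDIR_SHIFT, pgds, &istart, &iend, &count);
	sv = (u64 *)rtbl;
	rtbl = populate_entries(tbl, rtbl, istart, iend, PMD_TYPE_TABLE, PAGE_SIZE);
	tbl = sv;				/* descend into the first PMD page */

	/* Last level: PMD entries map 2 MB blocks of [phys, ...]. */
	compute_indices(vstart, vend, SWAPPER_BLOCK_SHIFT, PTRS_PER_PTE,
			&istart, &iend, &count);
	phys &= ~(SWAPPER_BLOCK_SIZE - 1);	/* bic: align phys down to a block */
	populate_entries(tbl, phys, istart, iend, flags, SWAPPER_BLOCK_SIZE);
}

Because consecutive PMD pages are physically adjacent, indexing past entry 511 of the first PMD page simply runs into the next one, which is exactly how the scaled iend from compute_indices is meant to be consumed.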

Walkthrough of the setup sequence

First, the init_pg_dir region is cache-invalidated and zeroed, so that dirty cache lines cannot later be evicted over the freshly written tables.

/*
 * Setup the initial page tables. We only setup the barest amount which is
 * required to get the kernel running. The following sections are required:
 *   - identity mapping to enable the MMU (low address, TTBR0)
 *   - first few MB of the kernel linear mapping to jump to once the MMU has
 *     been enabled
 */
__create_page_tables:
	mov	x28, lr

	/*
	 * Invalidate the init page tables to avoid potential dirty cache lines
	 * being evicted. Other page tables are allocated in rodata as part of
	 * the kernel image, and thus are clean to the PoC per the boot
	 * protocol.
	 */
	adrp	x0, init_pg_dir
	adrp	x1, init_pg_end
	sub	x1, x1, x0
	bl	__inval_dcache_area

	/*
	 * Clear the init page tables.
	 */
	adrp	x0, init_pg_dir
	adrp	x1, init_pg_end
	sub	x1, x1, x0
1:	stp	xzr, xzr, [x0], #16
	stp	xzr, xzr, [x0], #16
	stp	xzr, xzr, [x0], #16
	stp	xzr, xzr, [x0], #16
	subs	x1, x1, #64
	b.ne	1b

Next comes the identity map (idmap_pg_dir), which maps the virtual region [__idmap_text_start, __idmap_text_end] onto the physical region [__idmap_text_start, __idmap_text_end].
When creating the identity map, if VA_BITS is too small and system RAM sits sufficiently high in the physical address space, the mapping cannot be created, so the virtual address range has to be extended. There are two cases (a worked example follows the listing below):

  1. If VA_BITS is less than 48, an extra table level is added above the page global directory; idmap_pg_dir then points at this extra table rather than at the PGD.
  2. If VA_BITS equals 48, it is enough to grow the number of entries in the PGD.
	/*
	 * Create the identity mapping.
	 */
	adrp	x0, idmap_pg_dir
	adrp	x3, __idmap_text_start		// __pa(__idmap_text_start)

#ifdef CONFIG_ARM64_USER_VA_BITS_52
	mrs_s	x6, SYS_ID_AA64MMFR2_EL1
	and	x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
	mov	x5, #52
	cbnz	x6, 1f
#endif
	mov	x5, #VA_BITS
1:
	adr_l	x6, vabits_user
	str	x5, [x6]
	dmb	sy
	dc	ivac, x6		// Invalidate potentially stale cache line

	/*
	 * VA_BITS may be too small to allow for an ID mapping to be created
	 * that covers system RAM if that is located sufficiently high in the
	 * physical address space. So for the ID map, use an extended virtual
	 * range in that case, and configure an additional translation level
	 * if needed.
	 *
	 * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
	 * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
	 * this number conveniently equals the number of leading zeroes in
	 * the physical address of __idmap_text_end.
	 */
	adrp	x5, __idmap_text_end
	clz	x5, x5
	cmp	x5, TCR_T0SZ(VA_BITS)	// default T0SZ small enough?
	b.ge	1f			// .. then skip VA range extension

	adr_l	x6, idmap_t0sz
	str	x5, [x6]
	dmb	sy
	dc	ivac, x6		// Invalidate potentially stale cache line

#if (VA_BITS < 48)
#define EXTRA_SHIFT	(PGDIR_SHIFT + PAGE_SHIFT - 3)
#define EXTRA_PTRS	(1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))

	/*
	 * If VA_BITS < 48, we have to configure an additional table level.
	 * First, we have to verify our assumption that the current value of
	 * VA_BITS was chosen such that all translation levels are fully
	 * utilised, and that lowering T0SZ will always result in an additional
	 * translation level to be configured.
	 */
#if VA_BITS != EXTRA_SHIFT
#error "Mismatch between VA_BITS and page size/number of translation levels"
#endif

	mov	x4, EXTRA_PTRS
	create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
#else
	/*
	 * If VA_BITS == 48, we don't have to configure an additional
	 * translation level, but the top-level table has more entries.
	 */
	mov	x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
	str_l	x4, idmap_ptrs_per_pgd, x5
#endif
1:
	ldr_l	x4, idmap_ptrs_per_pgd
	mov	x5, x3				// __pa(__idmap_text_start)
	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)

	map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
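
A worked example of the extra-level case (assuming 4 KB pages and VA_BITS = 39, i.e. 3 translation levels, PGDIR_SHIFT = 30, PAGE_SHIFT = 12, PHYS_MASK_SHIFT = 48): EXTRA_SHIFT = 30 + 12 - 3 = 39 = VA_BITS, so the compile-time sanity check passes, and EXTRA_PTRS = 1 << (48 - 39) = 512. The extra top-level table is thus exactly one page of 512 entries, each spanning 2^39 bytes (one entire original PGD's worth of address space), extending the ID-map range to the full 48-bit physical space.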

Next, the kernel image mapping is created (in init_pg_dir), mapping the virtual region [KIMAGE_VADDR + TEXT_OFFSET + KASLR_OFFSET, KIMAGE_VADDR + TEXT_OFFSET + KASLR_OFFSET + (_end - _text)] onto the physical region [_text, _end].

	/*
	 * Map the kernel image (starting with PHYS_OFFSET).
	 */
	adrp	x0, init_pg_dir
	mov_q	x5, KIMAGE_VADDR + TEXT_OFFSET	// compile time __va(_text)
	add	x5, x5, x23			// add KASLR displacement
	mov	x4, PTRS_PER_PGD
	adrp	x6, _end			// runtime __pa(_end)
	adrp	x3, _text			// runtime __pa(_text)
	sub	x6, x6, x3			// _end - _text
	add	x6, x6, x5			// runtime __va(_end)

	map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14

At this point the MMU can be enabled, with TTBR0 pointing at idmap_pg_dir and TTBR1 pointing at init_pg_dir.
Since physical addresses are necessarily low, the high bits of an identity-mapped address are all zero, so the identity map must go through TTBR0.

ARM64 processors have two page table base registers, TTBR0 and TTBR1. When bit 63 of a virtual address is 0, the table pointed at by TTBR0 is used; when bit 63 is 1, the table pointed at by TTBR1 is used.
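
As a minimal sketch of that selection rule (ignoring tagged addresses and 52-bit VAs; hypothetical helper, not kernel code):

/* All-ones upper bits -> TTBR1 (kernel half); all-zeros -> TTBR0. */
static int uses_ttbr1(u64 va)
{
	return (va >> 63) & 1;
}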

/*
 * Enable the MMU.
 *
 *  x0  = SCTLR_EL1 value for turning on the MMU.
 *  x1  = TTBR1_EL1 value
 *
 * Returns to the caller via x30/lr. This requires the caller to be covered
 * by the .idmap.text section.
 *
 * Checks if the selected granule size is supported by the CPU.
 * If it isn't, park the CPU
 */
ENTRY(__enable_mmu)
	mrs	x2, ID_AA64MMFR0_EL1
	ubfx	x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4
	cmp	x2, #ID_AA64MMFR0_TGRAN_SUPPORTED
	b.ne	__no_granule_support
	update_early_cpu_boot_status 0, x2, x3
	adrp	x2, idmap_pg_dir
	phys_to_ttbr x1, x1
	phys_to_ttbr x2, x2
	msr	ttbr0_el1, x2			// load TTBR0
	offset_ttbr1 x1
	msr	ttbr1_el1, x1			// load TTBR1
	isb
	msr	sctlr_el1, x0
	isb
	/*
	 * Invalidate the local I-cache so that any instructions fetched
	 * speculatively from the PoC are discarded, since they may have
	 * been dynamically patched at the PoU.
	 */
	ic	iallu
	dsb	nsh
	isb
	ret
ENDPROC(__enable_mmu)

In reality, though, KASLR_OFFSET has not yet been initialized at this point; it is only computed by kaslr_early_init after the initial kernel image mapping is in place.

/*
 * This routine will be executed with the kernel mapped at its default virtual
 * address, and if it returns successfully, the kernel will be remapped, and
 * start_kernel() will be executed from a randomized virtual offset. The
 * relocation will result in all absolute references (e.g., static variables
 * containing function pointers) to be reinitialized, and zero-initialized
 * .bss variables will be reset to 0.
 */
u64 __init kaslr_early_init(u64 dt_phys)

So the code jumps back into __create_page_tables to remap the kernel once more. With that, the initial kernel mapping is complete.

__primary_switch:
#ifdef CONFIG_RANDOMIZE_BASE
	mov	x19, x0				// preserve new SCTLR_EL1 value
	mrs	x20, sctlr_el1			// preserve old SCTLR_EL1 value
#endif

	adrp	x1, init_pg_dir
	bl	__enable_mmu
#ifdef CONFIG_RELOCATABLE
	bl	__relocate_kernel
#ifdef CONFIG_RANDOMIZE_BASE
	ldr	x8, =__primary_switched
	adrp	x0, __PHYS_OFFSET
	blr	x8

	/*
	 * If we return here, we have a KASLR displacement in x23 which we need
	 * to take into account by discarding the current kernel mapping and
	 * creating a new one.
	 */
	pre_disable_mmu_workaround
	msr	sctlr_el1, x20			// disable the MMU
	isb
	bl	__create_page_tables		// recreate kernel mapping

	tlbi	vmalle1				// Remove any stale TLB entries
	dsb	nsh

	msr	sctlr_el1, x19			// re-enable the MMU
	isb
	ic	iallu				// flush instructions fetched
	dsb	nsh				// via old mapping
	isb

	bl	__relocate_kernel
#endif
#endif
	ldr	x8, =__primary_switched
	adrp	x0, __PHYS_OFFSET
	br	x8
ENDPROC(__primary_switch)

[Figure: identity mapping layout]

[Figure: kernel image mapping layout]

One more detail: how is the switch between pre-MMU and post-MMU kernel addresses actually achieved?
Code and data needed before the MMU comes up, and during the transition, run at low physical addresses; the transition code keeps working across the switch because the identity map covers it.
Once the tables are built, control jumps to code linked at the high kernel virtual addresses, which completes the address switch.

early_fixmap_init

early_fixmap_init builds, in init_mm->PGD, the page table structure for the FIXADDR region; no actual mappings are installed at this stage.

/*
 * Here we define all the compile-time 'special' virtual
 * addresses. The point is to have a constant address at
 * compile time, but to set the physical address only
 * in the boot process.
 *
 * These 'compile-time allocated' memory buffers are
 * page-sized. Use set_fixmap(idx,phys) to associate
 * physical memory with fixmap indices.
 *
 */
enum fixed_addresses {
	FIX_HOLE,

	/*
	 * Reserve a virtual window for the FDT that is 2 MB larger than the
	 * maximum supported size, and put it at the top of the fixmap region.
	 * The additional space ensures that any FDT that does not exceed
	 * MAX_FDT_SIZE can be mapped regardless of whether it crosses any
	 * 2 MB alignment boundaries.
	 *
	 * Keep this at the top so it remains 2 MB aligned.
	 */
#define FIX_FDT_SIZE		(MAX_FDT_SIZE + SZ_2M)
	FIX_FDT_END,
	FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,

	FIX_EARLYCON_MEM_BASE,
	FIX_TEXT_POKE0,

#ifdef CONFIG_ACPI_APEI_GHES
	/* Used for GHES mapping from assorted contexts */
	FIX_APEI_GHES_IRQ,
	FIX_APEI_GHES_NMI,
#endif /* CONFIG_ACPI_APEI_GHES */

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
	FIX_ENTRY_TRAMP_DATA,
	FIX_ENTRY_TRAMP_TEXT,
#define TRAMP_VALIAS		(__fix_to_virt(FIX_ENTRY_TRAMP_TEXT))
#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
	__end_of_permanent_fixed_addresses,

	/*
	 * Temporary boot-time mappings, used by early_ioremap(),
	 * before ioremap() is functional.
	 */
#define NR_FIX_BTMAPS		(SZ_256K / PAGE_SIZE)
#define FIX_BTMAPS_SLOTS	7
#define TOTAL_FIX_BTMAPS	(NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)

	FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
	FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,

	/*
	 * Used for kernel page table creation, so unmapped memory may be used
	 * for tables.
	 */
	FIX_PTE,
	FIX_PMD,
	FIX_PUD,
	FIX_PGD,

	__end_of_fixed_addresses
};

On ARM64, init_mm->PGD at this point is init_pg_dir:

#define INIT_MM_CONTEXT(name)	\
	.pgd = init_pg_dir,

/*
 * For dynamically allocated mm_structs, there is a dynamically sized cpumask
 * at the end of the structure, the size of which depends on the maximum CPU
 * number the system can see. That way we allocate only as much memory for
 * mm_cpumask() as needed for the hundreds, or thousands of processes that
 * a system typically runs.
 *
 * Since there is only one init_mm in the entire system, keep it simple
 * and size this cpu_bitmask to NR_CPUS.
 */
struct mm_struct init_mm = {
	.mm_rb		= RB_ROOT,
	.pgd		= swapper_pg_dir,
	.mm_users	= ATOMIC_INIT(2),
	.mm_count	= ATOMIC_INIT(1),
	.mmap_sem	= __RWSEM_INITIALIZER(init_mm.mmap_sem),
	.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
	.user_ns	= &init_user_ns,
	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
	INIT_MM_CONTEXT(init_mm)
};

<start_kernel() -> setup_arch() -> early_fixmap_init()>

/*
 * The p*d_populate functions call virt_to_phys implicitly so they can't be used
 * directly on kernel symbols (bm_p*d). This function is called too early to use
 * lm_alias so __p*d_populate functions must be used to populate with the
 * physical address from __pa_symbol.
 */
void __init early_fixmap_init(void)
{
	pgd_t *pgdp, pgd;
	pud_t *pudp;
	pmd_t *pmdp;
	unsigned long addr = FIXADDR_START;

	pgdp = pgd_offset_k(addr);	// virtual address of addr's pgd entry in init_mm->PGD
	pgd = READ_ONCE(*pgdp);		// value of addr's pgd entry in init_mm->PGD
	if (CONFIG_PGTABLE_LEVELS > 3 &&
	    !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) {
		/*
		 * We only end up here if the kernel mapping and the fixmap
		 * share the top level pgd entry, which should only happen on
		 * 16k/4 levels configurations.
		 */
		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
		pudp = pud_offset_kimg(pgdp, addr);
	} else {
		// if the pgd is empty, populate it with the address of bm_PUD
		if (pgd_none(pgd))
			__pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
		pudp = fixmap_pud(addr);	// virtual address of addr's pud entry in the PUD
	}
	// if the pud is empty, populate it with the address of bm_PMD
	if (pud_none(READ_ONCE(*pudp)))
		__pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
	pmdp = fixmap_pmd(addr);	// virtual address of addr's pmd entry in the PMD
	// populate the pmd with the address of bm_PTE
	__pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);

	/*
	 * The boot-ioremap range spans multiple pmds, for which
	 * we are not prepared:
	 */

	// check whether the boot-ioremap range spans more than one pmd (only
	// a single pmd, i.e. a single PT, has been prepared at this point)
	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));

	if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
	     || pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
		WARN_ON(1);
		pr_warn("pmdp %p != %p, %p\n",
			pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
			fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
		pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
			fix_to_virt(FIX_BTMAP_BEGIN));
		pr_warn("fix_to_virt(FIX_BTMAP_END):   %08lx\n",
			fix_to_virt(FIX_BTMAP_END));

		pr_warn("FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
		pr_warn("FIX_BTMAP_BEGIN:     %d\n", FIX_BTMAP_BEGIN);
	}
}
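
The bm_pud/bm_pmd/bm_pte tables populated above are statically allocated, page-aligned arrays inside the kernel image (from arch/arm64/mm/mmu.c):

static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;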

The FIXADDR region serves certain fixed mappings (the FDT, ...) as well as temporary mappings for manipulating physical memory (for example, early_pgtable_alloc uses it to zero a newly allocated page table page):

static phys_addr_t __init early_pgtable_alloc(void)
{
	phys_addr_t phys;
	void *ptr;

	phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);

	/*
	 * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
	 * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
	 * any level of table.
	 */
	ptr = pte_set_fixmap(phys);

	memset(ptr, 0, PAGE_SIZE);

	/*
	 * Implicit barriers also ensure the zeroed page is visible to the page
	 * table walker
	 */
	pte_clear_fixmap();

	return phys;
}

paging_init

<start_kernel() -> setup_arch() -> paging_init()>

/*
 * paging_init() sets up the page tables, initialises the zone memory
 * maps and sets up the zero page.
 */
void __init paging_init(void)
{
	pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));

	map_kernel(pgdp);
	map_mem(pgdp);

	pgd_clear_fixmap();

	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
	init_mm.pgd = swapper_pg_dir;

	memblock_free(__pa_symbol(init_pg_dir),
		      __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));

	memblock_allow_resize();
}

First, pgd_set_fixmap maps the physical page backing swapper_pg_dir into the FIXADDR region: the fixed virtual address for the FIX_PGD slot is computed, and the mapping is installed there. The relevant path:

#define pgd_set_fixmap(addr)	((pgd_t *)set_fixmap_offset(FIX_PGD, addr))

#define set_fixmap_offset(idx, phys) \
	__set_fixmap_offset(idx, phys, FIXMAP_PAGE_NORMAL)

/* Return a pointer with offset calculated */
#define __set_fixmap_offset(idx, phys, flags)				\
({									\
	unsigned long ________addr;					\
	__set_fixmap(idx, phys, flags);					\
	________addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1));	\
	________addr;							\
})

void __set_fixmap(enum fixed_addresses idx,
		  phys_addr_t phys, pgprot_t flags)
{
	unsigned long addr = __fix_to_virt(idx);
	pte_t *ptep;

	BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);

	ptep = fixmap_pte(addr);

	if (pgprot_val(flags)) {
		set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
	} else {
		pte_clear(&init_mm, addr, ptep);
		flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
	}
}

#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
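
Slots are handed out downwards from FIXADDR_TOP, one page each. A short usage sketch (this is the pattern paging_init follows, shown here in isolation):

	/* Map swapper_pg_dir's physical page at the FIX_PGD slot and get a pointer to it. */
	pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
	/* ... manipulate the table through pgdp ... */
	pgd_clear_fixmap();		/* tear the temporary window down again */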



My working assumption is that mapping swapper_pg_dir into FIXADDR is a matter of access convenience: the physical memory holding swapper_pg_dir was already mapped into init_pg_dir during the initial mapping, so it could equally be initialized directly through that virtual address.

In the figure below, the first segment is the initial kernel mapping and the second is the FIXADDR mapping of swapper_pg_dir; the physical range covered by the second mapping clearly falls inside the first.

Then the full kernel image mapping and the linear mapping of all physical memory are built in swapper_pg_dir, and TTBR1 is switched over to it.
init_pg_dir has served its purpose and is released via memblock_free.

	map_kernel(pgdp);
	map_mem(pgdp);

	pgd_clear_fixmap();

	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
	init_mm.pgd = swapper_pg_dir;

	memblock_free(__pa_symbol(init_pg_dir),
		      __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));

	memblock_allow_resize();

map_kernel(pgdp): remaps the individual segments of the kernel image. In head.S the image was mapped with block mappings; here it is remapped at page granularity.
map_mem(pgdp): the linear mapping of physical memory. All physical memory is linearly mapped into kernel space starting at PAGE_OFFSET, to speed up the kernel's access to memory.
