Linux Kernel Physical Memory Initialization -- ARM64 v5.0

Questions

  1. How does the kernel obtain physical memory information?
  2. In the sparse memory model, how is the relationship between mem_section and page established? How are pfn_to_page and page_to_pfn implemented?
  3. How is the relationship between node, zone, and page established?

Memblock Initialization

Memblock is a way of managing memory regions during the early boot period, before the usual kernel memory allocators are up and running.
Memblock views system memory as a collection of contiguous regions, of which there are several types:

memory - describes the physical memory available to the kernel; this may differ from the physical memory actually installed in the system, for instance when memory is restricted with the mem= command line parameter.
reserved - describes regions that have already been allocated.
physmap - describes the actual physical memory regardless of any restrictions; the physmap type is only available on some architectures.

/**
 * DOC: memblock overview
 *
 * Memblock is a method of managing memory regions during the early
 * boot period when the usual kernel memory allocators are not up and
 * running.
 *
 * Memblock views the system memory as collections of contiguous
 * regions. There are several types of these collections:
 *
 * * ``memory`` - describes the physical memory available to the
 *   kernel; this may differ from the actual physical memory installed
 *   in the system, for instance when the memory is restricted with
 *   ``mem=`` command line parameter
 * * ``reserved`` - describes the regions that were allocated
 * * ``physmap`` - describes the actual physical memory regardless of
 *   the possible restrictions; the ``physmap`` type is only available
 *   on some architectures.

The related data structure definitions:

/**
 * enum memblock_flags - definition of memory region attributes
 * @MEMBLOCK_NONE: no special request
 * @MEMBLOCK_HOTPLUG: hotpluggable region
 * @MEMBLOCK_MIRROR: mirrored region
 * @MEMBLOCK_NOMAP: don't add to kernel direct mapping
 */
enum memblock_flags {
	MEMBLOCK_NONE		= 0x0,	/* No special request */
	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
	MEMBLOCK_NOMAP		= 0x4,	/* don't add to kernel direct mapping */
};

/**
 * struct memblock_region - represents a memory region
 * @base: physical address of the region
 * @size: size of the region
 * @flags: memory region attributes
 * @nid: NUMA node id
 */
struct memblock_region {
	phys_addr_t base;
	phys_addr_t size;
	enum memblock_flags flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	int nid;
#endif
};

/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions
 * @max: size of the allocated array
 * @total_size: size of all regions
 * @regions: array of regions
 * @name: the memory type symbolic name
 */
struct memblock_type {
	unsigned long cnt;
	unsigned long max;
	phys_addr_t total_size;
	struct memblock_region *regions;
	char *name;
};

/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction?
 * @current_limit: physical address of the current allocation limit
 * @memory: usabe memory regions
 * @reserved: reserved memory regions
 * @physmem: all physical memory
 */
struct memblock {
	bool bottom_up;  /* is bottom up direction? */
	phys_addr_t current_limit;
	struct memblock_type memory;
	struct memblock_type reserved;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	struct memblock_type physmem;
#endif
};

The top-level memblock structure is statically initialized:


static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
#endif

struct memblock memblock __initdata_memblock = {
	.memory.regions		= memblock_memory_init_regions,
	.memory.cnt		= 1,	/* empty dummy entry */
	.memory.max		= INIT_MEMBLOCK_REGIONS,
	.memory.name		= "memory",

	.reserved.regions	= memblock_reserved_init_regions,
	.reserved.cnt		= 1,	/* empty dummy entry */
	.reserved.max		= INIT_MEMBLOCK_RESERVED_REGIONS,
	.reserved.name		= "reserved",

#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	.physmem.regions	= memblock_physmem_init_regions,
	.physmem.cnt		= 1,	/* empty dummy entry */
	.physmem.max		= INIT_PHYSMEM_REGIONS,
	.physmem.name		= "physmem",
#endif

	.bottom_up		= false,
	.current_limit		= MEMBLOCK_ALLOC_ANYWHERE,
};

During the setup phase, the kernel scans the FDT for physical memory information and adds it to memblock via memblock_add.

/**
 * memblock_add - add new memblock region
 * @base: base address of the new region
 * @size: size of the new region
 *
 * Add new memblock region [@base, @base + @size) to the "memory"
 * type. See memblock_add_range() description for mode details
 *
 * Return:
 * 0 on success, -errno on failure.
 */
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
	phys_addr_t end = base + size - 1;

	memblock_dbg("memblock_add: [%pa-%pa] %pF\n",
		     &base, &end, (void *)_RET_IP_);

	return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
}

arm64_memblock_init then trims this memory and removes certain ranges, completing the memblock initialization.
From that point on, the memblock APIs can be used to allocate and free physical memory.
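
To make the add/merge behavior concrete, here is a minimal user-space sketch of the idea behind memblock_add_range: keep an address-sorted array of regions and merge neighbours that touch or overlap. This is a simplified model (fixed-size array, no nid or flags handling), not the kernel code.

#include <stdio.h>

typedef unsigned long long phys_addr_t;

struct region { phys_addr_t base, size; };

static struct region regions[16];
static int cnt;

static void region_add(phys_addr_t base, phys_addr_t size)
{
	int i;

	/* insert, keeping the array sorted by base address */
	for (i = 0; i < cnt; i++)
		if (base < regions[i].base)
			break;
	for (int j = cnt; j > i; j--)
		regions[j] = regions[j - 1];
	regions[i] = (struct region){ base, size };
	cnt++;

	/* merge neighbouring regions that touch or overlap */
	for (i = 0; i + 1 < cnt; ) {
		struct region *r = &regions[i], *n = &regions[i + 1];
		phys_addr_t rend = r->base + r->size;
		phys_addr_t nend = n->base + n->size;

		if (rend < n->base) {
			i++;			/* disjoint, move on */
			continue;
		}
		r->size = (nend > rend ? nend : rend) - r->base;
		for (int j = i + 1; j + 1 < cnt; j++)
			regions[j] = regions[j + 1];
		cnt--;
	}
}

int main(void)
{
	region_add(0x80000000ULL, 0x40000000ULL);	/* 1GB @ 2GB */
	region_add(0x100000000ULL, 0x40000000ULL);	/* 1GB @ 4GB */
	region_add(0xc0000000ULL, 0x40000000ULL);	/* fills the gap */

	for (int i = 0; i < cnt; i++)	/* prints one merged 3GB region */
		printf("region %d: [%#llx-%#llx]\n", i, regions[i].base,
		       regions[i].base + regions[i].size - 1);
	return 0;
}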

bootmem_init

The bulk of the sparse memory model initialization happens in bootmem_init.

void __init bootmem_init(void)
{
	unsigned long min, max;

	min = PFN_UP(memblock_start_of_DRAM());
	max = PFN_DOWN(memblock_end_of_DRAM());

	early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);

	max_pfn = max_low_pfn = max;

	arm64_numa_init();
	/*
	 * Sparsemem tries to allocate bootmem in memory_present(), so must be
	 * done after the fixed reservations.
	 */
	arm64_memory_present();

	sparse_init();
	zone_sizes_init(min, max);

	memblock_dump_all();
}
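
Note the asymmetry of PFN_UP for the start and PFN_DOWN for the end: partial pages at either edge of DRAM are excluded. The macros below mirror the definitions in include/linux/pfn.h; the sketch is standalone and the addresses are made up.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long start = 0x80000800;	/* DRAM start, not page aligned */
	unsigned long end   = 0xc0000800;	/* DRAM end, not page aligned */

	/* rounding inwards drops the partial page at each edge */
	printf("min pfn %#lx, max pfn %#lx\n", PFN_UP(start), PFN_DOWN(end));
	return 0;
}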

arm64_memory_present

Initializes the mem_section array.
It iterates over all usable physical memory regions in memblock and calls memory_present on each.

static void __init arm64_memory_present(void)
{
	struct memblock_region *reg;

	for_each_memblock(memory, reg) {
		int nid = memblock_get_region_node(reg);

		memory_present(nid, memblock_region_memory_base_pfn(reg),
			       memblock_region_memory_end_pfn(reg));
	}
}

memory_present records node information in each section->section_mem_map.
A note on CONFIG_SPARSEMEM_EXTREME:
in the sparse memory model, mem_section can either be a static two-dimensional array or be allocated dynamically.

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp;
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT	(PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT	1
#endif

#define SECTION_NR_TO_ROOT(sec)	((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS	DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
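
As a quick sanity check of the two-level indexing, here is a standalone sketch assuming a 4K PAGE_SIZE and a 16-byte struct mem_section (two unsigned-long-sized fields on 64-bit), which gives SECTIONS_PER_ROOT = 256; the real value depends on the config.

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define MEM_SECTION_SIZE	16UL	/* assumed sizeof(struct mem_section) */
#define SECTIONS_PER_ROOT	(PAGE_SIZE / MEM_SECTION_SIZE)
#define SECTION_NR_TO_ROOT(sec)	((sec) / SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK	(SECTIONS_PER_ROOT - 1)

int main(void)
{
	unsigned long sec = 1000;

	/* each root is one page holding SECTIONS_PER_ROOT mem_section entries */
	printf("section %lu -> mem_section[%lu][%lu]\n",
	       sec, SECTION_NR_TO_ROOT(sec), sec & SECTION_ROOT_MASK);
	return 0;
}
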
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
	}
#endif

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);	/* allocate the mem_section array for this section's root */
		set_section_nid(section, nid);		/* record the section's node in section_to_node_table */

		ms = __nr_to_section(section);
		/* during boot, the section_mem_map field is used to store the nid */
		if (!ms->section_mem_map) {
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			section_mark_present(ms);	/* update __highest_present_section_nr to bound later scans */
		}
	}
}

The conversion between a pfn and a section number works as follows:

static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
	return pfn >> PFN_SECTION_SHIFT;
}
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
	return sec << PFN_SECTION_SHIFT;
}
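
For a feel of the numbers: on arm64 in v5.0, SECTION_SIZE_BITS is 30, so with 4K pages PFN_SECTION_SHIFT = 30 - 12 = 18 and each section covers 2^18 pages = 1GB. A standalone sketch:

#include <stdio.h>

#define SECTION_SIZE_BITS	30	/* arm64 v5.0 */
#define PAGE_SHIFT		12
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)

int main(void)
{
	unsigned long pfn = 0x80000;	/* physical address 0x80000000 */

	/* pfn_to_section_nr() / section_nr_to_pfn() arithmetic */
	printf("pfn %#lx -> section %lu\n", pfn, pfn >> PFN_SECTION_SHIFT);
	printf("section 2 starts at pfn %#lx\n", 2UL << PFN_SECTION_SHIFT);
	return 0;
}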

sparse_index_init allocates the next-level array of mem_section entries for CONFIG_SPARSEMEM_EXTREME (the array inside a single ROOT).
set_section_nid records the section's nid in the section_to_node_table, which makes it easy to find the node a given page belongs to.

static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;

	if (mem_section[root])
		return -EEXIST;

	section = sparse_index_alloc(nid);
	if (!section)
		return -ENOMEM;

	mem_section[root] = section;

	return 0;
}

#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

int page_to_nid(const struct page *page)
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);

static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
#endif

/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node. This keeps us from having to use another data structure. The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return (nid << SECTION_NID_SHIFT);
}
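
To illustrate the early encoding, a standalone sketch assuming the v5.0 bit layout (bit 0 PRESENT, bit 1 HAS_MEM_MAP, bit 2 ONLINE, nid starting at SECTION_NID_SHIFT = 3):

#include <stdio.h>

#define SECTION_MARKED_PRESENT	(1UL << 0)
#define SECTION_IS_ONLINE	(1UL << 2)
#define SECTION_NID_SHIFT	3

int main(void)
{
	unsigned long nid = 1;
	/* what memory_present() plus section_mark_present() leave in the field */
	unsigned long section_mem_map = (nid << SECTION_NID_SHIFT) |
					SECTION_IS_ONLINE | SECTION_MARKED_PRESENT;

	/* sparse_early_nid() just shifts the flag bits away */
	printf("encoded %#lx -> nid %lu\n",
	       section_mem_map, section_mem_map >> SECTION_NID_SHIFT);
	return 0;
}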

sparse_init

sparse_init allocates, for each present section of every node, space for PAGES_PER_SECTION struct page structures plus one usemap.
It also maps all the struct page structures into the vmemmap region, making them contiguous in virtual address space, which simplifies access and conversion.

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum_begin = first_present_section_nr();
	int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
	unsigned long pnum_end, map_count = 1;

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}

static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		unsigned long *pageblock_bitmap)
{
	ms->section_mem_map &= ~SECTION_MAP_MASK;
	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
							SECTION_HAS_MEM_MAP;
	ms->pageblock_flags = pageblock_bitmap;
}


/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	unsigned long pnum, usemap_longs, *usemap;
	struct page *map;

	usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
							  usemap_size() *
							  map_count);
	if (!usemap) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}

	sparse_buffer_init(map_count * section_map_size(), nid);	/* preallocate map space for all sections of this node */
	for_each_present_section_nr(pnum_begin, pnum) {
		if (pnum >= pnum_end)
			break;

		map = sparse_mem_map_populate(pnum, nid, NULL);	/* carve out one section's map and populate its vmemmap mapping */
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			pnum_begin = pnum;
			goto failed;
		}
		check_usemap_section_nr(nid, usemap);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
		usemap += usemap_longs;
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}


struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
		struct vmem_altmap *altmap)
{
	unsigned long start;
	unsigned long end;
	struct page *map;

	map = pfn_to_page(pnum * PAGES_PER_SECTION);
	start = (unsigned long)map;
	end = (unsigned long)(map + PAGES_PER_SECTION);

	if (vmemmap_populate(start, end, nid, altmap))
		return NULL;

	return map;
}


int __meminit vmemmap_populate_basepages(unsigned long start,
					 unsigned long end, int node)
{
	unsigned long addr = start;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	for (; addr < end; addr += PAGE_SIZE) {
		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;
		p4d = vmemmap_p4d_populate(pgd, addr, node);
		if (!p4d)
			return -ENOMEM;
		pud = vmemmap_pud_populate(p4d, addr, node);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_populate(pud, addr, node);
		if (!pmd)
			return -ENOMEM;
		pte = vmemmap_pte_populate(pmd, addr, node);
		if (!pte)
			return -ENOMEM;
		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
	}

	return 0;
}

zone_sizes_init

Take UMA as the example.
First the hole sizes of the DMA32 and NORMAL zones are computed into zhole_size, then free_area_init_node is called.


static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
	struct memblock_region *reg;
	unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
	unsigned long max_dma = min;

	memset(zone_size, 0, sizeof(zone_size));

	/* 4GB maximum for 32-bit only capable devices */
#ifdef CONFIG_ZONE_DMA32
	max_dma = PFN_DOWN(arm64_dma_phys_limit);
	zone_size[ZONE_DMA32] = max_dma - min;
#endif
	zone_size[ZONE_NORMAL] = max - max_dma;

	memcpy(zhole_size, zone_size, sizeof(zhole_size));

	for_each_memblock(memory, reg) {
		unsigned long start = memblock_region_memory_base_pfn(reg);
		unsigned long end = memblock_region_memory_end_pfn(reg);

		if (start >= max)
			continue;

#ifdef CONFIG_ZONE_DMA32
		if (start < max_dma) {
			unsigned long dma_end = min(end, max_dma);
			zhole_size[ZONE_DMA32] -= dma_end - start;
		}
#endif
		if (end > max_dma) {
			unsigned long normal_end = min(end, max);
			unsigned long normal_start = max(start, max_dma);
			zhole_size[ZONE_NORMAL] -= normal_end - normal_start;
		}
	}

	free_area_init_node(0, zone_size, min, zhole_size);
}
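
A worked example with made-up numbers: suppose DRAM consists of pfn ranges [0x80000, 0x100000) and [0x140000, 0x200000), and max_dma is 0x100000 (the 4GB boundary with 4K pages). The loop above then leaves DMA32 with no hole and NORMAL with a 0x40000-page hole. A standalone sketch of the same computation:

#include <stdio.h>

struct reg { unsigned long start, end; };

int main(void)
{
	struct reg regs[] = { { 0x80000, 0x100000 }, { 0x140000, 0x200000 } };
	unsigned long min = 0x80000, max = 0x200000, max_dma = 0x100000;
	unsigned long zhole_dma32 = max_dma - min;	/* start from the full span */
	unsigned long zhole_normal = max - max_dma;

	for (int i = 0; i < 2; i++) {
		unsigned long start = regs[i].start, end = regs[i].end;

		if (start < max_dma) {	/* region overlaps DMA32 */
			unsigned long dma_end = end < max_dma ? end : max_dma;
			zhole_dma32 -= dma_end - start;
		}
		if (end > max_dma) {	/* region overlaps NORMAL */
			unsigned long nstart = start > max_dma ? start : max_dma;
			unsigned long nend = end < max ? end : max;
			zhole_normal -= nend - nstart;
		}
	}
	/* prints: zhole DMA32 0, NORMAL 0x40000 */
	printf("zhole DMA32 %#lx, NORMAL %#lx\n", zhole_dma32, zhole_normal);
	return 0;
}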

  • Sets some pgdat fields.
  • calculate_node_totalpages computes and records, for the node and each of its zones, spanned_pages (total page count, holes included) and present_pages (usable page count).
  • Calls free_area_init_core.

void __init free_area_init_node(int nid, unsigned long *zones_size,
				unsigned long node_start_pfn,
				unsigned long *zholes_size)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = 0;
	unsigned long end_pfn = 0;

	/* pg_data_t should be reset to zero when it's allocated */
	WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);

	pgdat->node_id = nid;
	pgdat->node_start_pfn = node_start_pfn;
	pgdat->per_cpu_nodestats = NULL;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
		(u64)start_pfn << PAGE_SHIFT,
		end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
#else
	start_pfn = node_start_pfn;
#endif
	calculate_node_totalpages(pgdat, start_pfn, end_pfn,
				  zones_size, zholes_size);

	alloc_node_mem_map(pgdat);
	pgdat_set_deferred_range(pgdat);

	free_area_init_core(pgdat);
}

The computation in calculate_node_totalpages is carried out mainly by zone_spanned_pages_in_node and zone_absent_pages_in_node. The former simply clamps the zone's architectural pfn range to the node's pfn range; the latter walks the regions of memblock.memory and subtracts the pages they cover from the spanned count.

static void __init calculate_node_totalpages(struct pglist_data *pgdat,
					     unsigned long node_start_pfn,
					     unsigned long node_end_pfn,
					     unsigned long *zones_size,
					     unsigned long *zholes_size)
{
	unsigned long realtotalpages = 0, totalpages = 0;
	enum zone_type i;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;
		unsigned long zone_start_pfn, zone_end_pfn;
		unsigned long size, real_size;

		size = zone_spanned_pages_in_node(pgdat->node_id, i,
						  node_start_pfn,
						  node_end_pfn,
						  &zone_start_pfn,
						  &zone_end_pfn,
						  zones_size);
		real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
						  node_start_pfn, node_end_pfn,
						  zholes_size);
		if (size)
			zone->zone_start_pfn = zone_start_pfn;
		else
			zone->zone_start_pfn = 0;
		zone->spanned_pages = size;
		zone->present_pages = real_size;

		totalpages += size;
		realtotalpages += real_size;
	}

	pgdat->node_spanned_pages = totalpages;
	pgdat->node_present_pages = realtotalpages;
	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
	       realtotalpages);
}

/*
 * Return the number of pages a zone spans in a node, including holes
 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
 */
static unsigned long __init zone_spanned_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long node_start_pfn,
					unsigned long node_end_pfn,
					unsigned long *zone_start_pfn,
					unsigned long *zone_end_pfn,
					unsigned long *ignored)
{
	/* When hotadd a new node from cpu_up(), the node should be empty */
	if (!node_start_pfn && !node_end_pfn)
		return 0;

	/* Get the start and end of the zone */
	*zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
	*zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
	adjust_zone_range_for_zone_movable(nid, zone_type,
					   node_start_pfn, node_end_pfn,
					   zone_start_pfn, zone_end_pfn);

	/* Check that this node has pages within the zone's required range */
	if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
		return 0;

	/* Move the zone boundaries inside the node if necessary */
	*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
	*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);

	/* Return the spanned pages */
	return *zone_end_pfn - *zone_start_pfn;
}


/*
 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
 * then all holes in the requested range will be accounted for.
 */
unsigned long __init __absent_pages_in_range(int nid,
				unsigned long range_start_pfn,
				unsigned long range_end_pfn)
{
	unsigned long nr_absent = range_end_pfn - range_start_pfn;
	unsigned long start_pfn, end_pfn;
	int i;

	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
		nr_absent -= end_pfn - start_pfn;
	}
	return nr_absent;
}

The free_area_init_core function iterates over every zone in the node and:

  • Initializes the node's internal structures such as list_heads and spinlocks.
  • Computes the number of free pages: zone->present_pages minus the pages occupied by the memmap's struct page array, minus dma_reserve.
  • Initializes the zone's internal data structures.
  • Initializes zone->free_area.
  • Initializes the memmap's struct page entries and marks every pageblock MIGRATE_MOVABLE.

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * NOTE: pgdat should get zeroed by caller.
 * NOTE: this function is only called during early init.
 */
static void __init free_area_init_core(struct pglist_data *pgdat)
{
	enum zone_type j;
	int nid = pgdat->node_id;

	pgdat_init_internals(pgdat);
	pgdat->per_cpu_nodestats = &boot_nodestats;

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, freesize, memmap_pages;
		unsigned long zone_start_pfn = zone->zone_start_pfn;

		size = zone->spanned_pages;
		freesize = zone->present_pages;

		/*
		 * Adjust freesize so that it accounts for how much memory
		 * is used by this zone for memmap. This affects the watermark
		 * and per-cpu initialisations
		 */
		memmap_pages = calc_memmap_size(size, freesize);
		if (!is_highmem_idx(j)) {
			if (freesize >= memmap_pages) {
				freesize -= memmap_pages;
				if (memmap_pages)
					printk(KERN_DEBUG
					       "  %s zone: %lu pages used for memmap\n",
					       zone_names[j], memmap_pages);
			} else
				pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
					zone_names[j], memmap_pages, freesize);
		}

		/* Account for reserved pages */
		if (j == 0 && freesize > dma_reserve) {
			freesize -= dma_reserve;
			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
			       zone_names[0], dma_reserve);
		}

		if (!is_highmem_idx(j))
			nr_kernel_pages += freesize;
		/* Charge for highmem memmap if there are enough kernel pages */
		else if (nr_kernel_pages > memmap_pages * 2)
			nr_kernel_pages -= memmap_pages;
		nr_all_pages += freesize;

		/*
		 * Set an approximate value for lowmem here, it will be adjusted
		 * when the bootmem allocator frees pages into the buddy system.
		 * And all highmem pages will be managed by the buddy system.
		 */
		zone_init_internals(zone, j, nid, freesize);

		if (!size)
			continue;

		set_pageblock_order();
		setup_usemap(pgdat, zone, zone_start_pfn, size);
		init_currently_empty_zone(zone, zone_start_pfn, size);
		memmap_init(size, nid, j, zone_start_pfn);
	}
}

Answers

1. How does the kernel obtain physical memory information?

start_kernel->
setup_arch->
setup_machine_fdt

Physical memory information is read from the FDT.
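
Concretely, the information comes from the memory node of the device tree. A hypothetical fragment (addresses invented for illustration, assuming two address cells and two size cells); the (base, size) pairs in reg are what end up in memblock.memory:

memory@80000000 {
	device_type = "memory";
	/* 2GB at 0x80000000 and 2GB at 0x880000000 */
	reg = <0x0 0x80000000 0x0 0x80000000>,
	      <0x8 0x80000000 0x0 0x80000000>;
};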

2. In the sparse memory model, how is the relationship between mem_section and page established? How are pfn_to_page and page_to_pfn implemented?

The setup process is analyzed above; here we look at the implementation of the two conversion macros.

With CONFIG_SPARSEMEM_VMEMMAP enabled, the vmemmap region maps all page structures contiguously in virtual address space, so the conversion is trivial.

#elif defined(CONFIG_SPARSEMEM_VMEMMAP)

/* memmap is virtually contiguous. */
#define __pfn_to_page(pfn) (vmemmap + (pfn))
#define __page_to_pfn(page) (unsigned long)((page) - vmemmap)
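
A standalone model of why this works, with a small mock array standing in for the kernel's fixed vmemmap virtual region:

#include <stdio.h>

struct page { unsigned long flags; };

static struct page mock_mem_map[1024];
#define vmemmap mock_mem_map	/* in the kernel this is a fixed virtual address */

#define __pfn_to_page(pfn)	(vmemmap + (pfn))
#define __page_to_pfn(page)	((unsigned long)((page) - vmemmap))

int main(void)
{
	struct page *p = __pfn_to_page(42);

	/* round trip: pfn -> page -> pfn */
	printf("pfn 42 -> page %p -> pfn %lu\n", (void *)p, __page_to_pfn(p));
	return 0;
}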

Under plain CONFIG_SPARSEMEM the conversion is considerably more involved.
First, __pfn_to_page:

  • __pfn_to_section finds the section:
    • pfn >> PFN_SECTION_SHIFT gives section_nr
    • mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK] gives the corresponding section
  • Masking section->section_mem_map and adding the pfn gives the corresponding page. This arithmetic looks counterintuitive because the masked value is actually the base address of the memmap minus the pfn of the first page in that memmap.

Then __page_to_pfn:

  • Get section_nr from page->flags: (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK.
  • Subtract the masked section->section_mem_map value from the page address to get the pfn (see the sketch after the helper code below).

#define __page_to_pfn(pg)					\
({	const struct page *__pg = (pg);				\
	int __sec = page_to_section(__pg);			\
	(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec)));	\
})

#define __pfn_to_page(pfn)					\
({	unsigned long __pfn = (pfn);				\
	struct mem_section *__sec = __pfn_to_section(__pfn);	\
	__section_mem_map_addr(__sec) + __pfn;			\
})

static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
	return pfn >> PFN_SECTION_SHIFT;
}
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
	return sec << PFN_SECTION_SHIFT;
}


#define SECTION_NR_TO_ROOT(sec)	((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS	DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK	(SECTIONS_PER_ROOT - 1)

#ifdef CONFIG_SPARSEMEM_EXTREME
extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif

static inline struct mem_section *__nr_to_section(unsigned long nr)
{
#ifdef CONFIG_SPARSEMEM_EXTREME
	if (!mem_section)
		return NULL;
#endif
	if (!mem_section[SECTION_NR_TO_ROOT(nr)])
		return NULL;
	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}

static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
	return __nr_to_section(pfn_to_section_nr(pfn));
}

static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
	unsigned long map = section->section_mem_map;
	map &= SECTION_MAP_MASK;
	return (struct page *)map;
}
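
To see the counterintuitive encoding in action: what sparse_encode_mem_map stores (and __section_mem_map_addr recovers) is mem_map minus the section's first pfn, so __pfn_to_page can add the full pfn directly. A standalone sketch with mock types (PFN_SECTION_SHIFT = 18 as on arm64 v5.0):

#include <stdio.h>

struct page { unsigned long flags; };

#define PFN_SECTION_SHIFT	18

int main(void)
{
	static struct page section_map[8];	/* tiny stand-in for a section's mem_map */
	unsigned long pnum = 2;			/* section number */
	unsigned long first_pfn = pnum << PFN_SECTION_SHIFT;

	/* sparse_encode_mem_map(): bias the base by the section's first pfn */
	struct page *encoded = section_map - first_pfn;

	/* __pfn_to_page(): add the full pfn, not the offset within the section */
	unsigned long pfn = first_pfn + 5;
	struct page *p = encoded + pfn;

	/* __page_to_pfn(): subtract the biased base to recover the pfn */
	printf("pfn %#lx -> &section_map[%td], back to pfn %#lx\n",
	       pfn, p - section_map, (unsigned long)(p - encoded));
	return 0;
}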

3. How is the relationship between node, zone, and page established?

A page's flags store the indices of its node and zone; zones themselves are embedded in the pgdat structure, and zone_init_internals points zone->zone_pgdat back at the owning node.

<start_kernel->setup_arch()->bootmem_init()->zone_sizes_init()->memmap_init_zone()->__init_single_page()->set_page_links>

static inline void set_page_zone(struct page *page, enum zone_type zone)
{
	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}

static inline void set_page_node(struct page *page, unsigned long node)
{
	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}

static inline void set_page_links(struct page *page, enum zone_type zone,
				  unsigned long node, unsigned long pfn)
{
	set_page_zone(page, zone);
	set_page_node(page, node);
#ifdef SECTION_IN_PAGE_FLAGS
	set_page_section(page, pfn_to_section_nr(pfn));
#endif
}
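
A sketch of the flags packing; the shift and mask values here are invented for illustration (the kernel derives them from NODES_SHIFT, ZONES_SHIFT, etc., and places the fields at the top of page->flags):

#include <stdio.h>

/* hypothetical layout: 2 bits of zone, then 10 bits of node */
#define ZONES_PGSHIFT	0
#define ZONES_MASK	0x3UL
#define NODES_PGSHIFT	2
#define NODES_MASK	0x3ffUL

struct page { unsigned long flags; };

static void set_page_links(struct page *page, unsigned long zone,
			   unsigned long node)
{
	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}

int main(void)
{
	struct page page = { 0 };

	set_page_links(&page, 1, 0);	/* e.g. ZONE_NORMAL on node 0 */
	printf("zone %lu node %lu\n",
	       (page.flags >> ZONES_PGSHIFT) & ZONES_MASK,
	       (page.flags >> NODES_PGSHIFT) & NODES_MASK);
	return 0;
}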

