Linux Kernel Buddy System -- ARM64 v5.0

Note: throughout this write-up on the buddy system, "page" refers to an order-n block of pages.

Questions

  1. When is physical memory handed over to the buddy system, and which physical memory does the buddy system manage?
  2. How does the buddy system complete an allocation, from the ideal case down to the worst case?
  3. How does the buddy system complete a free?

Initialization

free_area initialization

free_area manages all of a zone's free lists. They are grouped by order into multiple sets of free lists, and each set contains MIGRATE_TYPES lists.
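
For orientation, this is roughly what the per-order structure looks like (a sketch of the v5.0 include/linux/mmzone.h definitions, abridged; treat it as an approximation rather than the verbatim source):

struct free_area {
	struct list_head	free_list[MIGRATE_TYPES];	/* one free list per migratetype */
	unsigned long		nr_free;			/* number of free blocks of this order */
};

struct zone {
	/* ... */
	struct free_area	free_area[MAX_ORDER];		/* one free_area per order */
	/* ... */
};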

<start_kernel()->setup_arch()->bootmem_init()->zone_sizes_init()->...->zone_init_free_lists()>

#define for_each_migratetype_order(order, type) \
for (order = 0; order < MAX_ORDER; order++) \
for (type = 0; type < MIGRATE_TYPES; type++)


static void __meminit zone_init_free_lists(struct zone *zone)
{
unsigned int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
}

build_all_zonelists_init

This builds the node->zonelists that memory allocation relies on later.
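
A zonelist is essentially an ordered array of zone references. Roughly (again a sketch of the v5.0 mmzone.h definitions, abridged):

struct zoneref {
	struct zone *zone;	/* the zone to allocate from */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};

struct zonelist {
	struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];	/* terminated by a NULL zone */
};

Each node ends up with a ZONELIST_FALLBACK zonelist and, on NUMA, an additional ZONELIST_NOFALLBACK one, as the code below shows.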

<start_kernel()->build_all_zonelists()>
static noinline void __init
build_all_zonelists_init(void)
{
int cpu;

__build_all_zonelists(NULL);

/*
* Initialize the boot_pagesets that are going to be used
* for bootstrapping processors. The real pagesets for
* each zone will be allocated later when the per cpu
* allocator is available.
*
* boot_pagesets are used also for bootstrapping offline
* cpus if the system is already booted because the pagesets
* are needed to initialize allocators on a specific cpu too.
* F.e. the percpu allocator needs the page allocator which
* needs the percpu allocator in order to allocate its pagesets
* (a chicken-egg dilemma).
*/
for_each_possible_cpu(cpu)
setup_pageset(&per_cpu(boot_pageset, cpu), 0);

mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
}

__build_all_zonelists iterates over all nodes and calls build_zonelists for each.

static void __build_all_zonelists(void *data)
{
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
static DEFINE_SPINLOCK(lock);

spin_lock(&lock);

#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif

/*
* This node is hotadded and no memory is yet present. So just
* building zonelists is fine - no need to touch other nodes.
*/
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);

build_zonelists(pgdat);
}

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* We now know the "local memory node" for each node--
* i.e., the node of the first zone in the generic zonelist.
* Set up numa_mem percpu variable for on-line cpus. During
* boot, only the boot cpu should be on-line; we'll init the
* secondary cpus' numa_mem as they come on-line. During
* node/memory hotplug, we'll fixup all on-line cpus.
*/
for_each_online_cpu(cpu)
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
}

spin_unlock(&lock);
}

build_zonelists first computes the node_order array, sorted by increasing node_distance, and then calls build_zonelists_in_node_order.

static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;

/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
load = nr_online_nodes;
prev_node = local_node;
nodes_clear(used_mask);

memset(node_order, 0, sizeof(node_order));
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
node_load[node] = load;

node_order[nr_nodes++] = node;
prev_node = node;
load--;
}

build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
}

build_zonelists_in_node_order walks the node_order array in order and appends the buddy-managed zones of those nodes to pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs, to be used as fallback zones when memory runs short.

/*
* Build zonelists ordered by node and zones within node.
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
unsigned nr_nodes)
{
struct zoneref *zonerefs;
int i;

zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;

for (i = 0; i < nr_nodes; i++) {
int nr_zones;

pg_data_t *node = NODE_DATA(node_order[i]);

nr_zones = build_zonerefs_node(node, zonerefs);
zonerefs += nr_zones;
}
zonerefs->zone = NULL;
zonerefs->zone_idx = 0;
}

For allocations that are not allowed to fall back to other nodes (__GFP_THISNODE), only this node's zones may be used.

/*
* Build gfp_thisnode zonelists
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
struct zoneref *zonerefs;
int nr_zones;

zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
nr_zones = build_zonerefs_node(pgdat, zonerefs);
zonerefs += nr_zones;
zonerefs->zone = NULL;
zonerefs->zone_idx = 0;
}

Handing over physical memory

Where does the memory managed by the buddy system come from? At mem_init time, memblock_free_all releases the free physical memory into the buddy system.

/*
* mem_init() marks the free areas in the mem_map and tells us how much memory
* is free. This is done after various parts of the system have claimed their
* memory after the kernel image.
*/
void __init mem_init(void)
{
if (swiotlb_force == SWIOTLB_FORCE ||
max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT))
swiotlb_init(1);
else
swiotlb_force = SWIOTLB_NO_FORCE;

set_max_mapnr(pfn_to_page(max_pfn) - mem_map);

#ifndef CONFIG_SPARSEMEM_VMEMMAP
free_unused_memmap();
#endif
/* this will put all unused low memory onto the freelists */
memblock_free_all();

kexec_reserve_crashkres_pages();

mem_init_print_info(NULL);

/*
* Check boundaries twice: Some fundamental inconsistencies can be
* detected at build time already.
*/
#ifdef CONFIG_COMPAT
BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
#endif

if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
extern int sysctl_overcommit_memory;
/*
* On a machine this small we won't get anywhere without
* overcommit, so turn it on by default.
*/
sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
}
}
  • The managed_pages count of every zone of every online node is reset to 0 (reset_all_zones_managed_pages).
  • free_low_memory_core_early
    /**
    * memblock_free_all - release free pages to the buddy allocator
    *
    * Return: the number of pages actually released.
    */
    unsigned long __init memblock_free_all(void)
    {
    unsigned long pages;

    reset_all_zones_managed_pages();

    pages = free_low_memory_core_early();
    totalram_pages_add(pages);

    return pages;
    }

free_low_memory_core_early feeds the free memory recorded in memblock into the buddy system via __free_pages: each free range is carved into order-aligned blocks, and every block is eventually released through __free_pages_boot_core (a standalone sketch of that carve-up follows the listing below).

static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
phys_addr_t start, end;
u64 i;

memblock_clear_hotplug(0, -1);

for_each_reserved_mem_region(i, &start, &end)
reserve_bootmem_region(start, end);

/*
* We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
NULL)
count += __free_memory_core(start, end);

return count;
}
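
How a free range turns into order-n frees is easiest to see with a small standalone sketch. This is illustrative user-space C, not the kernel's code; it only mimics the carve-up performed on the way from each free memblock range down to __free_pages_boot_core: the range is split into the largest naturally aligned power-of-two blocks and each block is freed as a single order-n page.

#include <stdio.h>

#define MAX_ORDER 11

static unsigned int largest_order(unsigned long start, unsigned long end)
{
	unsigned int order = MAX_ORDER - 1;

	/* limit the order by the natural alignment of `start`... */
	while (order && (start & ((1UL << order) - 1)))
		order--;
	/* ...and so that the block does not run past `end` */
	while (start + (1UL << order) > end)
		order--;
	return order;
}

int main(void)
{
	unsigned long start = 0x260, end = 0x400;	/* an arbitrary free pfn range */

	while (start < end) {
		unsigned int order = largest_order(start, end);

		printf("free pfn %#lx as an order-%u block\n", start, order);
		start += 1UL << order;
	}
	return 0;
}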

static void __init __free_pages_boot_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
unsigned int loop;

prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
__ClearPageReserved(p);
set_page_count(p, 0);

atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
set_page_refcounted(page);
__free_pages(page, order);
}

Page allocation

The outermost page-allocation API is alloc_pages.
It takes the node of the current CPU and calls alloc_pages_node.

#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)

/* Returns the number of the current Node. */
static inline int numa_node_id(void)
{
return raw_cpu_read(numa_node);
}
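
For context, a minimal usage sketch of this API (illustrative; the GFP flags and error handling depend on the caller's context):

/* Allocate 4 physically contiguous pages (order 2) and free them again. */
struct page *page = alloc_pages(GFP_KERNEL, 2);
if (page) {
	void *addr = page_address(page);	/* kernel linear-map address of the block */
	/* ... use the four pages ... */
	__free_pages(page, 2);			/* the order must match the allocation */
}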

From there we reach the core allocation function, __alloc_pages_nodemask.


/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW; //the watermark must be at least LOW for this allocation to proceed
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };

/*
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
if (unlikely(order >= MAX_ORDER)) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}

gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;

finalise_ac(gfp_mask, &ac);

/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);

/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
goto out;

/*
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
*/
alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;

/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;

page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}

trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);

Allocation preparation

First, prepare_alloc_pages fills in alloc_flags and the alloc_context and performs some early checks.

static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
ac->high_zoneidx = gfp_zone(gfp_mask); //the highest-priority zone this allocation is allowed to use

//the memory zones of the current CPU's node and all of its fallback nodes that allow this allocation. If __GFP_THISNODE is specified, only the current node's zonelist is used.
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
ac->migratetype = gfpflags_to_migratetype(gfp_mask);


//if the current task is bound to certain CPUs, the allocation may only use the nodes those CPUs belong to
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
if (!ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
}

fs_reclaim_acquire(gfp_mask);
fs_reclaim_release(gfp_mask);

might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

if (should_fail_alloc_page(gfp_mask, order))
return false;

if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
*alloc_flags |= ALLOC_CMA;

return true;
}

finalise_ac further sets ac->spread_dirty_pages and ac->preferred_zoneref, settling on the preferred zone.

/* Determine whether to spread dirty pages and what the first usable zone */
static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
{
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

/*
* The preferred zone is used for statistics but crucially it is
* also used as the starting point for the zonelist iterator. It
* may get reset for allocations that ignore memory policies.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}

get_page_from_freelist (fastpath)

  • Iterate over the zones permitted by high_zoneidx and nodemask
    • Check that the CPUs we are allowed to run on include the zone's node
    • Check that the zone would still satisfy the dirty-page limit after this allocation
    • Decide whether this allocation is allowed to fragment memory
    • Check whether the zone's remaining free space is above the requested watermark (see the sketch after this list)
      • If not, attempt node reclaim
      • If so, try to allocate from this zone
  • Trying to allocate from a zone means
    • rmqueue takes a page out of the zone
    • prep_new_page initialises the new page
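
The watermark test referenced above boils down to the following simplified sketch (an approximation under assumptions, not the kernel's zone_watermark_fast()/__zone_watermark_ok(); for order > 0 the real check additionally requires that a free block of at least the requested order exists in some free_area):

/* Illustrative only: would the zone still sit above `mark` plus the lowmem
 * reserve kept for lower zones after handing out this request? */
static bool watermark_ok_sketch(long free_pages, unsigned int order,
				long mark, long lowmem_reserve)
{
	free_pages -= (1L << order) - 1;	/* charge the request pessimistically */
	return free_pages > mark + lowmem_reserve;
}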
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
bool no_fallback;

retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;

// iterate over the zones permitted by high_zoneidx and nodemask.
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page;
unsigned long mark;

// check whether the allowed CPUs include the node this zone belongs to
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* XXX: For now, allow allocations to potentially
* exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* nodes are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
// dirty-page limit, see the comment above
if (ac->spread_dirty_pages) {
if (last_pgdat_dirty_limit == zone->zone_pgdat)
continue;

if (!node_dirty_ok(zone->zone_pgdat)) {
last_pgdat_dirty_limit = zone->zone_pgdat;
continue;
}
}

// if we have to fall back to another node, allow fragmenting fallbacks, because the kernel considers locality more important than fragmentation avoidance
// (the author does not quite see what this design achieves here...)
if (no_fallback && nr_online_nodes > 1 &&
zone != ac->preferred_zoneref->zone) {
int local_nid;

/*
* If moving to a remote node, retry but allow
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
local_nid = zone_to_nid(ac->preferred_zoneref->zone);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
}

mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
// check whether the zone's remaining free space is above the requested watermark; returns true if it is.
if (!zone_watermark_fast(zone, order, mark,
ac_classzone_idx(ac), alloc_flags)) {
int ret;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
// if ALLOC_NO_WATERMARKS is specified, try this zone regardless of the watermark
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;

if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;

// perform node reclaim
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
ac_classzone_idx(ac), alloc_flags))
goto try_this_zone;

continue;
}
}

try_this_zone:
// try to allocate from this zone
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);

// initialise the new page
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);

/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);

return page;
} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
}
}

/*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
if (no_fallback) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}

return NULL;
}

static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags)
{
int i;

post_alloc_hook(page, order, gfp_flags);

if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
// zero the pages
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);

if (order && (gfp_flags & __GFP_COMP))
// set up the compound page
prep_compound_page(page, order);

/*
* page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
* allocate the page. The expectation is that the caller is taking
* steps that will free more memory. The caller should avoid the page
* being used for !PFMEMALLOC purposes.
*/
if (alloc_flags & ALLOC_NO_WATERMARKS)
//set the pfmemalloc flag.
set_page_pfmemalloc(page);
else
clear_page_pfmemalloc(page);
}

rmqueue

rmqueue does the actual work of taking an allocation out of the given zone. The process is fairly involved (and dispatches in many directions, which makes the walkthrough a little messy); it is the core allocation logic of the buddy system.

rmqueue_pcplist

First, order-0 allocations are served from the pcplist rather than from the buddy free lists in the classical sense.
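
The per-CPU cache itself is described by struct per_cpu_pages, roughly as follows (abridged from the v5.0 include/linux/mmzone.h; take the exact layout as approximate):

struct per_cpu_pages {
	int count;		/* number of pages currently on the lists */
	int high;		/* above this, pages are flushed back to the buddy lists */
	int batch;		/* chunk size when refilling from / draining to the buddy lists */

	/* lists of pages, one per migratetype kept on the pcp lists */
	struct list_head lists[MIGRATE_PCPTYPES];
};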

static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;

if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype, alloc_flags);
goto out;
}

__rmqueue_pcplist: if the corresponding pcp list is not empty, take its first page. Otherwise call rmqueue_bulk to request pcp->batch pages from the buddy system and add them to the pcp list, then take the first page as usual.

/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;

do {
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, alloc_flags);
if (unlikely(list_empty(list)))
return NULL;
}

page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));

return page;
}

rmqueue_bulk can be viewed as a looping wrapper around __rmqueue.

/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;

spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
if (unlikely(page == NULL))
break;

if (unlikely(check_pcp_refill(page)))
continue;

/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
* caller's list. From the callers perspective, the linked list
* is ordered by page number under some conditions. This is
* useful for IO devices that can forward direction from the
* head, thus also in the physical page order. This is useful
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}

/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
* on i. Do not confuse with 'alloced' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return alloced;
}

__rmqueue:

  • Call __rmqueue_smallest
    • Walk the free_areas starting at the requested order
      • If area->free_list[migratetype] is empty, try to get a page from the next higher order.
      • Once a page is obtained
        • remove it from the list
        • set page->private = 0 (rmv_page_order)
        • call expand to split the block and refill the lower-order free_areas
        • set page->index = migratetype (set_pcppage_migratetype)
  • If nothing was obtained and the migratetype is MIGRATE_MOVABLE, try the CMA fallback: __rmqueue_smallest(zone, order, MIGRATE_CMA)
  • If that still fails, call __rmqueue_fallback to keep trying; that function is fairly involved and is analysed in its own section below.
    /*
    * Do the hard work of removing an element from the buddy allocator.
    * Call me with the zone->lock already held.
    */
    static __always_inline struct page *
    __rmqueue(struct zone *zone, unsigned int order, int migratetype,
    unsigned int alloc_flags)
    {
    struct page *page;

    retry:
    page = __rmqueue_smallest(zone, order, migratetype);
    if (unlikely(!page)) {
    if (migratetype == MIGRATE_MOVABLE)
    page = __rmqueue_cma_fallback(zone, order);

    if (!page && __rmqueue_fallback(zone, order, migratetype,
    alloc_flags))
    goto retry;
    }

    trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
    }

    /*
    * Go through the free lists for the given migratetype and remove
    * the smallest available page from the freelists
    */
    static __always_inline
    struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
    int migratetype)
    {
    unsigned int current_order;
    struct free_area *area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
    area = &(zone->free_area[current_order]);
    page = list_first_entry_or_null(&area->free_list[migratetype],
    struct page, lru);
    if (!page)
    continue;
    list_del(&page->lru);
    rmv_page_order(page);
    area->nr_free--;
    expand(zone, page, order, current_order, area, migratetype);
    set_pcppage_migratetype(page, migratetype);
    return page;
    }

    return NULL;
    }


The expand operation is where the word "buddy" comes in.
Starting from the order of the block actually taken and stepping down to the requested order, the block is halved at each step: the upper half is put on
area[cur_order]->free_list[migratetype] and its page->private is set to cur_order.
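
A concrete walk-through may help. The standalone sketch below (ordinary user-space C, not kernel code) mimics expand()'s loop for an order-0 request that was satisfied from an order-3 block: each iteration hands the upper half back to a lower-order free list, and the first page is returned to the caller.

#include <stdio.h>

int main(void)
{
	unsigned int low = 0, high = 3;		/* requested order vs. order of the block found */
	unsigned long size = 1UL << high;	/* 8 pages */

	while (high > low) {
		high--;
		size >>= 1;
		/* the upper half of the current block goes back to free_area[high] */
		printf("pages [%lu..%lu) -> free_list of order %u\n", size, size * 2, high);
	}
	printf("pages [0..%lu) returned to the caller\n", 1UL << low);
	return 0;
}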

/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
* testing. Specifically, as large blocks of memory are subdivided,
* the order in which smaller blocks are delivered depends on the order
* they're subdivided in this function. This is the primary factor
* influencing the order in which pages are delivered to the IO
* subsystem according to empirical testing, and this is also justified
* by considering the behavior of a buddy system containing a single
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
* -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area,
int migratetype)
{
unsigned long size = 1 << high;

while (high > low) {
area--;
high--;
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
if (set_page_guard(zone, &page[size], high, migratetype))
continue;

list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
}
}

Below is a before-and-after example of an allocation requesting an order-0 page (the diagram from the original post is omitted here).

Regular allocation (non-pcp path)

(Below the divider line in the following listing:)
If the ALLOC_HARDER policy is specified, try allocating from the MIGRATE_HIGHATOMIC migratetype first.
Otherwise, call __rmqueue (analysed above).

static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;

if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype, alloc_flags);
goto out;
}

/*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);

-----------------------------------------------------------------------------------


do {
page = NULL;
if (alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));

__rmqueue_fallback

Next, the fallback handling.

A two-dimensional fallbacks array specifies, for each migratetype, which other migratetypes an allocation may fall back to when its own free lists are exhausted.

/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
[MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
#endif
};

find_suitable_fallback looks for a suitable fallback page; if one is found, it returns the corresponding migratetype.

/*
* Check whether there is a suitable fallback freepage with requested order.
* If only_stealable is true, this function returns fallback_mt only if
* we can steal other freepages all together. This would help to reduce
* fragmentation due to mixed migratetype pages in one pageblock.
*/
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal)
{
int i;
int fallback_mt;

if (area->nr_free == 0)
return -1;

*can_steal = false;
for (i = 0;; i++) {
fallback_mt = fallbacks[migratetype][i];
if (fallback_mt == MIGRATE_TYPES)
break;

if (list_empty(&area->free_list[fallback_mt]))
continue;

if (can_steal_fallback(order, migratetype))
*can_steal = true;

if (!only_stealable)
return fallback_mt;

if (*can_steal)
return fallback_mt;
}

return -1;
}

Here "suitable" has two parts:

  1. The new migratetype is listed in the original migratetype's fallbacks array.
  2. can_steal_fallback: whether the whole pageblock can be stolen (there are exceptions that still count as "suitable" even though the whole pageblock cannot be stolen).
    /*
    * When we are falling back to another migratetype during allocation, try to
    * steal extra free pages from the same pageblocks to satisfy further
    * allocations, instead of polluting multiple pageblocks.
    *
    * If we are stealing a relatively large buddy page, it is likely there will
    * be more free pages in the pageblock, so try to steal them all. For
    * reclaimable and unmovable allocations, we steal regardless of page size,
    * as fragmentation caused by those allocations polluting movable pageblocks
    * is worse than movable allocations stealing from unmovable and reclaimable
    * pageblocks.
    */
    // when stealing a relatively large buddy page, the pageblock is likely to hold more free pages, so try to steal them all.
    // for reclaimable and unmovable allocations, steal regardless of the page size. (the author has not yet fully understood the reasoning in this comment)
    static bool can_steal_fallback(unsigned int order, int start_mt)
    {
    /*
    * Leaving this order check is intended, although there is
    * relaxed order check in next check. The reason is that
    * we can actually steal whole pageblock if this condition met,
    * but, below check doesn't guarantee it and that is just heuristic
    * so could be changed anytime.
    */
    if (order >= pageblock_order)
    return true;

    if (order >= pageblock_order / 2 ||
    start_mt == MIGRATE_RECLAIMABLE ||
    start_mt == MIGRATE_UNMOVABLE ||
    page_group_by_mobility_disabled)
    return true;

    return false;
    }

__rmqueue_fallback:

  • Scan from high orders downward, trying to find the largest suitable fallback block.
    • If a suitable block is found, call steal_suitable_fallback.
    • If not all free pages of the pageblock can be stolen and the requested migratetype is movable, then to avoid permanent fragmentation, scan upward from the low orders instead and take the smallest suitable fallback block.
  • If no suitable fallback is found at any order, return false: the fallback allocation has failed.
      /*
      * Try finding a free buddy page on the fallback list and put it on the free
      * list of requested migratetype, possibly along with other pages from the same
      * block, depending on fragmentation avoidance heuristics. Returns true if
      * fallback was found so that __rmqueue_smallest() can grab it.
      *
      * The use of signed ints for order and current_order is a deliberate
      * deviation from the rest of this file, to make the for loop
      * condition simpler.
      */
      static __always_inline bool
      __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
      unsigned int alloc_flags)
      {
      struct free_area *area;
      int current_order;
      int min_order = order;
      struct page *page;
      int fallback_mt;
      bool can_steal;

      /*
      * Do not steal pages from freelists belonging to other pageblocks
      * i.e. orders < pageblock_order. If there are no local zones free,
      * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
      */
      if (alloc_flags & ALLOC_NOFRAGMENT)
      min_order = pageblock_order;

      /*
      * Find the largest available free page in the other list. This roughly
      * approximates finding the pageblock with the most free pages, which
      * would be too costly to do exactly.
      */
      for (current_order = MAX_ORDER - 1; current_order >= min_order;
      --current_order) {
      area = &(zone->free_area[current_order]);
      fallback_mt = find_suitable_fallback(area, current_order,
      start_migratetype, false, &can_steal);
      if (fallback_mt == -1)
      continue;

      /*
      * We cannot steal all free pages from the pageblock and the
      * requested migratetype is movable. In that case it's better to
      * steal and split the smallest available page instead of the
      * largest available page, because even if the next movable
      * allocation falls back into a different pageblock than this
      * one, it won't cause permanent fragmentation.
      */
      if (!can_steal && start_migratetype == MIGRATE_MOVABLE
      && current_order > order)
      goto find_smallest;

      goto do_steal;
      }

      return false;

      find_smallest:
      for (current_order = order; current_order < MAX_ORDER;
      current_order++) {
      area = &(zone->free_area[current_order]);
      fallback_mt = find_suitable_fallback(area, current_order,
      start_migratetype, false, &can_steal);
      if (fallback_mt != -1)
      break;
      }

      /*
      * This should not happen - we already found a suitable fallback
      * when looking for the largest page.
      */
      VM_BUG_ON(current_order == MAX_ORDER);

      do_steal:
      page = list_first_entry(&area->free_list[fallback_mt],
      struct page, lru);

      steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
      can_steal);

      trace_mm_page_alloc_extfrag(page, order, current_order,
      start_migratetype, fallback_mt);

      return true;

      }

/*
 * This function implements the actual steal behaviour. If `order` is large
 * enough, we can steal the whole pageblock. If not, we first move the free
 * pages in this pageblock to our migratetype and determine how many of the
 * already-allocated pages in the pageblock have a compatible migratetype.
 * If at least half of the pages are free or compatible, we can change the
 * migratetype of the whole pageblock, so that pages freed in the future end
 * up on the correct free list.
 */

static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
struct free_area *area;
int free_pages, movable_pages, alike_pages;
int old_block_type;

old_block_type = get_pageblock_migratetype(page);

/*
* This can happen due to races and we want to prevent broken
* highatomic accounting.
*/
if (is_migrate_highatomic(old_block_type))
goto single_page;

/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
goto single_page;
}

/*
* Boost watermarks to increase reclaim pressure to reduce the
* likelihood of future fallbacks. Wake kswapd now as the node
* may be balanced overall and kswapd will not wake naturally.
*/
boost_watermark(zone);
if (alloc_flags & ALLOC_KSWAPD)
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);

/* We are not allowed to try stealing from the whole block */
if (!whole_block)
goto single_page;

free_pages = move_freepages_block(zone, page, start_type,
&movable_pages);
/*
* Determine how many pages are compatible with our allocation.
* For movable allocation, it's the number of movable pages which
* we just obtained. For other types it's a bit more tricky.
*/
if (start_type == MIGRATE_MOVABLE) {
alike_pages = movable_pages;
} else {
/*
* If we are falling back a RECLAIMABLE or UNMOVABLE allocation
* to MOVABLE pageblock, consider all non-movable pages as
* compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
* vice versa, be conservative since we can't distinguish the
* exact migratetype of non-movable pages.
*/
if (old_block_type == MIGRATE_MOVABLE)
alike_pages = pageblock_nr_pages
- (free_pages + movable_pages);
else
alike_pages = 0;
}

/* moving whole block can fail due to zone boundary conditions */
if (!free_pages)
goto single_page;

/*
* If a sufficient number of pages in the block are either free or of
* comparable migratability as our allocation, claim the whole block.
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);

return;

single_page:
area = &zone->free_area[current_order];
list_move(&page->lru, &area->free_list[start_type]);
}

__alloc_pages_slowpath (slowpath)

After the fastpath fails, the allocation enters the slowpath.
(The flowchart borrowed from the web that illustrated this is omitted here.)

static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
enum compact_priority compact_priority;
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
int reserve_flags;

/*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/
if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;

retry_cpuset:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();

/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);

The slowpath uses a more aggressive allocation policy than the fastpath: the requested watermark is lowered to ALLOC_WMARK_MIN.

static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);

/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);

if (gfp_mask & __GFP_ATOMIC) {
/*
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
* comment for __cpuset_node_allowed().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;

if (gfp_mask & __GFP_KSWAPD_RECLAIM)
alloc_flags |= ALLOC_KSWAPD;

#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
}

If __GFP_KSWAPD_RECLAIM is set, the kswapd threads are woken up at this point to perform asynchronous reclaim.

if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);

The adjusted allocation policy might already be enough for the fastpath to succeed, so the fastpath is tried once more.

/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*/
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;

If that fails, memory compaction is attempted. __alloc_pages_direct_compact retries the fastpath once after the actual compaction.

/*
* For costly allocations, try direct compaction first, as it's likely
* that we have enough base pages and don't need to reclaim. For non-
* movable high-order allocations, do that as well, as compaction will
* try prevent permanent fragmentation by migrating from blocks of the
* same migratetype.
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;

If it still fails, we enter a phase that may keep retrying.
At the start of every retry, kswapd is woken up again.

retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);

Then check whether the watermark restriction may be ignored (i.e. whether the memory reserves may be used).

reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = reserve_flags;

Re-select preferred_zoneref according to the adjusted policy, and try the fastpath once more.

/*
* Reset the nodemask and zonelist iterators if memory policies can be
* ignored. These allocations are high priority and system rather than
* user oriented.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
/* Attempt with potentially adjusted zonelist and alloc_flags */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;

If that still fails, direct reclaim can no longer be avoided. If direct reclaim is not allowed, jump to nopage.

/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim)
goto nopage;

/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;

Perform one round of direct reclaim and one round of direct compaction.

/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;

/* Try direct compaction and then allocating */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;

If the page still could not be allocated, this attempt has failed; if retrying is not allowed, jump to nopage.

/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
goto nopage;

/*
* Do not retry costly high order allocations unless they are
* __GFP_RETRY_MAYFAIL
*/
if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;

Before retrying, first decide whether reclaim is still meaningful for this retry. It is considered pointless if either of the following holds:

  1. The kernel has already retried MAX_RECLAIM_RETRIES (16) times without any effect.
  2. Even after reclaiming every reclaimable page in all candidate zones, the allocation still could not be satisfied.
    if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
    did_some_progress > 0, &no_progress_loops))
    goto retry;
/*
* Checks whether it makes sense to retry the reclaim to make a forward progress
* for the given allocation request.
*
* We give up when we either have tried MAX_RECLAIM_RETRIES in a row
* without success, or when we couldn't even meet the watermark if we
* reclaimed all remaining pages on the LRU lists.
*
* Returns true if a retry is viable or false to enter the oom path.
*/
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
struct alloc_context *ac, int alloc_flags,
bool did_some_progress, int *no_progress_loops)
{
struct zone *zone;
struct zoneref *z;
bool ret = false;

/*
* Costly allocations might have made a progress but this doesn't mean
* their order will become available due to high fragmentation so
* always increment the no progress counter for them
*/
if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
*no_progress_loops = 0;
else
(*no_progress_loops)++;

/*
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
/* Before OOM, exhaust highatomic_reserve */
return unreserve_highatomic_pageblock(ac, true);
}

/*
* Keep reclaiming pages while there is a chance this will lead
* somewhere. If none of the target zones can satisfy our allocation
* request even if all reclaimable pages are considered then we are
* screwed and have to go OOM.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
unsigned long min_wmark = min_wmark_pages(zone);
bool wmark;

available = reclaimable = zone_reclaimable_pages(zone);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);

/*
* Would the allocation succeed if we reclaimed all
* reclaimable pages?
*/
wmark = __zone_watermark_ok(zone, order, min_wmark,
ac_classzone_idx(ac), alloc_flags, available);
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
/*
* If we didn't make any progress and have a lot of
* dirty + writeback pages then we should wait for
* an IO to complete to slow down the reclaim and
* prevent from pre mature OOM
*/
if (!did_some_progress) {
unsigned long write_pending;

write_pending = zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);

if (2 * write_pending > reclaimable) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
return true;
}
}

ret = true;
goto out;
}
}

out:
/*
* Memory allocation/reclaim might be called from a WQ context and the
* current implementation of the WQ concurrency control doesn't
* recognize that a particular WQ is congested if the worker thread is
* looping without ever sleeping. Therefore we have to do a short sleep
* here rather than calling cond_resched().
*/
if (current->flags & PF_WQ_WORKER)
schedule_timeout_uninterruptible(1);
else
cond_resched();
return ret;
}

If retrying reclaim is pointless, next decide whether retrying compaction is worthwhile. If did_some_progress == 0 there is no point in retrying compaction either, because compaction relies on a sufficient amount of free memory.

/*
* It doesn't make any sense to retry for the compaction if the order-0
* reclaim is not able to make any progress because the current
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
goto retry;

Check whether the cpuset was changed concurrently during the allocation; if so, re-enter the slowpath from the top.

/* Deal with possible cpuset update races before we start OOM killing */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;

Try the OOM machinery: kill the process with the highest badness score to free its memory. If that made progress, jump back to retry.

/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;

/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
(alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;

/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
no_progress_loops = 0;
goto retry;
}

That concludes the retry part. Now for the nopage part.

Reaching nopage does not mean there is no way forward. If the cpuset changed concurrently during the allocation, the slowpath is restarted.
If __GFP_NOFAIL is set, the allocator will try to tap the reserved pages or fall back to other nodes; if that fails, it first yields the CPU to other tasks and then retries indefinitely.

nopage:
/* Deal with possible cpuset update races before we fail */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;

/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/
if (gfp_mask & __GFP_NOFAIL) {
/*
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/
if (WARN_ON_ONCE(!can_direct_reclaim))
goto fail;

/*
* PF_MEMALLOC request from this context is rather bizarre
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
WARN_ON_ONCE(current->flags & PF_MEMALLOC);

/*
* non failing costly orders are a hard requirement which we
* are not prepared for much so let's warn about these users
* so that we can identify them and convert them to something
* else.
*/
WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);

/*
* Help non-failing allocations by giving them access to memory
* reserves but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
* the situation worse
*/
page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
if (page)
goto got_pg;

cond_resched();
goto retry;
}
static inline struct page *
__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags,
const struct alloc_context *ac)
{
struct page *page;

page = get_page_from_freelist(gfp_mask, order,
alloc_flags|ALLOC_CPUSET, ac);
/*
* fallback to ignore cpuset restriction if our nodes
* are depleted
*/
if (!page)
page = get_page_from_freelist(gfp_mask, order,
alloc_flags, ac);

return page;
}

Page freeing

(The overview diagram borrowed from the web is omitted here.)

__free_pages is the entry point for freeing pages. It decrements the page's reference count and tests for zero; if the count drops to zero, it calls free_the_page to release the page.

void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
free_the_page(page, order);
}
EXPORT_SYMBOL(__free_pages);

As with allocation, free_the_page special-cases order-0 pages, which can be released to the pcplist.

static inline void free_the_page(struct page *page, unsigned int order)
{
if (order == 0) /* Via pcp? */
free_unref_page(page);
else
__free_pages_ok(page, order);
}

free_unref_page

/*
* Free a 0-order page
*/
void free_unref_page(struct page *page)
{
unsigned long flags;
unsigned long pfn = page_to_pfn(page);

if (!free_unref_page_prepare(page, pfn))
return;

local_irq_save(flags);
free_unref_page_commit(page, pfn);
local_irq_restore(flags);
}

If the migratetype is one of the MIGRATE_PCPTYPES, the page is added directly to pcp->lists[migratetype]; if pcp->count >= pcp->high after the insertion, batch pages are flushed back to the buddy system. Otherwise the page is released back to the buddy system via free_one_page (in practice only MIGRATE_ISOLATE pages take that path, while MIGRATE_HIGHATOMIC is treated as movable).

static void free_unref_page_commit(struct page *page, unsigned long pfn)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
int migratetype;

migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);

/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
* offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(zone, page, pfn, 0, migratetype);
return;
}
migratetype = MIGRATE_MOVABLE;
}

pcp = &this_cpu_ptr(zone->pageset)->pcp;
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
}
}

free_pcppages_bulk decides, based on how full each pcp free list currently is, how to pick these count pages from the lists; it collects them on a temporary list and then frees each of them with __free_one_page.

/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
* If the zone was previously in an "all pages pinned" state then look to
* see if this freeing clears that state.
*
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
{
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = 0;
bool isolated_pageblocks;
struct page *page, *tmp;
LIST_HEAD(head);

while (count) {
struct list_head *list;

/*
* Remove pages from lists in a round-robin fashion. A
* batch_free count is maintained that is incremented when an
* empty list is encountered. This is so more pages are freed
* off fuller lists instead of spinning excessively around empty
* lists
*/
do {
batch_free++;
if (++migratetype == MIGRATE_PCPTYPES)
migratetype = 0;
list = &pcp->lists[migratetype];
} while (list_empty(list));

/* This is the only non-empty list. Free them all. */
if (batch_free == MIGRATE_PCPTYPES)
batch_free = count;

do {
page = list_last_entry(list, struct page, lru);
/* must delete to avoid corrupting pcp list */
list_del(&page->lru);
pcp->count--;

if (bulkfree_pcp_prepare(page))
continue;

list_add_tail(&page->lru, &head);

/*
* We are going to put the page back to the global
* pool, prefetch its buddy to speed up later access
* under zone->lock. It is believed the overhead of
* an additional test and calculating buddy_pfn here
* can be offset by reduced memory latency later. To
* avoid excessive prefetching due to large count, only
* prefetch buddy for the first pcp->batch nr of pages.
*/
if (prefetch_nr++ < pcp->batch)
prefetch_buddy(page);
} while (--count && --batch_free && !list_empty(list));
}

spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);

/*
* Use safe version since after __free_one_page(),
* page->lru.next will not point to original list.
*/
list_for_each_entry_safe(page, tmp, &head, lru) {
int mt = get_pcppage_migratetype(page);
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);

__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
}

__free_pages_ok

A wrapper around free_one_page.


static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);

if (!free_pages_prepare(page, order, true))
return;

migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}

__free_one_page

The core of page freeing in the buddy system.

It starts with some sanity checks.

static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype)
{
unsigned long combined_pfn;
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;

max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);

VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

VM_BUG_ON(migratetype == -1);
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);

VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);

It then enters a merging phase; freeing in the buddy system is a process of repeatedly merging buddies into higher-order pages.
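
The buddy lookup in the loop below relies on a simple bit trick: the buddy of a block at pfn p and order n is p with bit n flipped, and the merged block starts at the lower of the two pfns. A standalone illustration, assuming the same formula as __find_buddy_pfn():

#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);	/* flip bit `order` */
}

int main(void)
{
	unsigned long pfn = 8;		/* freeing an order-1 block starting at pfn 8 */
	unsigned int order = 1;
	unsigned long buddy = find_buddy_pfn(pfn, order);	/* -> 10 */
	unsigned long combined = buddy & pfn;			/* -> 8: start of the merged order-2 block */

	printf("buddy of pfn %lu at order %u is pfn %lu; merged block starts at pfn %lu\n",
	       pfn, order, buddy, combined);
	return 0;
}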

continue_merging:
while (order < max_order - 1) {
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);

if (!pfn_valid_within(buddy_pfn))
goto done_merging;
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy)) {
clear_page_guard(zone, buddy, order, migratetype);
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
}
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
order++;
}
if (max_order < MAX_ORDER) {
/* If we are here, it means order is >= pageblock_order.
* We want to prevent merge between freepages on isolate
* pageblock and normal pageblock. Without this, pageblock
* isolation could cause incorrect freepage or CMA accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;

buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);

if (migratetype != buddy_mt
&& (is_migrate_isolate(migratetype) ||
is_migrate_isolate(buddy_mt)))
goto done_merging;
}
max_order++;
goto continue_merging;
}

In the done_merging stage, the finally merged high-order page is added to the matching free_list. One detail: if the buddy at the next-higher order is also free, these free pages are likely to be merged again soon, so this page is added to the tail of the free_list, making it less likely to be handed out immediately and more likely to take part in a higher-order merge.

done_merging:
set_page_order(page, order);

/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
goto out;
}
}

list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
zone->free_area[order].nr_free++;