Linux Kernel Memory Compaction -- ARM64 v5.0

Questions

  1. What is the purpose of memory compaction?
  2. What is the difference between memory compaction and page migration?
  3. Why are pages divided into different migrate types?
  4. What does the compaction flow look like?

Overview

After the system has been running for a long time, free pages become more and more scattered, and allocating a large block of physically contiguous memory becomes increasingly difficult. Since large contiguous allocations are still needed at times, the fragmentation problem has to be addressed; the process of doing so is called memory compaction (also known as memory defragmentation).

The basic idea behind defragmentation is to group pages by how movable they are. Migrating physical memory used by the kernel itself would be difficult and complex, so the current kernel does not migrate the kernel's own pages. Pages used by user processes, on the other hand, are accessed through the user page tables, so moving them and updating the mappings is invisible to the process. Memory compaction is therefore built on top of page migration.

Pages are classified into migrate types such as movable, reclaimable, and unmovable. Movable pages are typically memory allocated by user-space processes; moving them only requires updating the page-table mappings. Reclaimable pages cannot be moved, but they can be freed.
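
For reference, the migrate types are declared roughly as follows in include/linux/mmzone.h around v5.0 (a sketch from memory; the exact members depend on options such as CONFIG_CMA and CONFIG_MEMORY_ISOLATION):

enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,       /* the number of types on the pcp lists */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES
};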

Memory compaction is one form of page migration. The kernel provides the migrate_pages interface for page migration; the caller only has to supply the set of pages to be migrated and the set of free pages to migrate them to (in the form of function pointers that obtain and release the target free pages).
Implementing memory compaction essentially comes down to building these two sets.

/*
* migrate_pages - migrate the pages specified in a list, to the free pages
* supplied as the target for the page migration
*
* @from: The list of pages to be migrated.
* @get_new_page: The function used to allocate free pages to be used
* as the target of the page migration.
* @put_new_page: The function used to free target pages if migration
* fails, or NULL if no special handling is necessary.
* @private: Private data to be passed on to get_new_page()
* @mode: The migration mode that specifies the constraints for
* page migration, if any.
* @reason: The reason for page migration.
*
* The function returns after 10 attempts or if no pages are movable any more
* because the list has become empty or no retryable pages exist any more.
* The caller should call putback_movable_pages() to return pages to the LRU
* or free list only if ret != 0.
*
* Returns the number of pages that were not migrated, or an error code.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
enum migrate_mode mode, int reason)
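
For orientation, a minimal (hypothetical) caller of this interface might look like the sketch below; the demo_* names are made up, and the callbacks follow the new_page_t/free_page_t signatures:

static struct page *demo_get_new_page(struct page *old, unsigned long private)
{
        /* Hand out any order-0 page as the migration target. */
        return alloc_page(GFP_KERNEL);
}

static void demo_put_new_page(struct page *newpage, unsigned long private)
{
        /* Called only for pages whose migration failed: release the target. */
        __free_page(newpage);
}

/* pages_to_move is a list of pages the caller has already isolated. */
static int demo_migrate(struct list_head *pages_to_move)
{
        int err;

        err = migrate_pages(pages_to_move, demo_get_new_page,
                            demo_put_new_page, 0, MIGRATE_SYNC, MR_COMPACTION);
        if (err)
                /* Pages that were not migrated must be put back by the caller. */
                putback_movable_pages(pages_to_move);
        return err;
}

Compaction itself follows exactly this pattern, passing compaction_alloc and compaction_free as the two callbacks (see below).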

Source Code

The entry point of direct compaction is __alloc_pages_direct_compact, which in turn calls try_to_compact_pages.

/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                unsigned int alloc_flags, const struct alloc_context *ac,
                enum compact_priority prio, enum compact_result *compact_result)
{
        struct page *page;
        unsigned long pflags;
        unsigned int noreclaim_flag;

        if (!order)
                return NULL;

        psi_memstall_enter(&pflags);
        noreclaim_flag = memalloc_noreclaim_save();

        *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
                                                                prio);

try_to_compact_pages iterates over every zone in ac->zonelist (subject to the nodemask) and calls compact_zone_order on each.

/**
* try_to_compact_pages - Direct compact to satisfy a high-order allocation
* @gfp_mask: The GFP mask of the current allocation
* @order: The order of the current allocation
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
* @prio: Determines how hard direct compaction should try to succeed
*
* This is the main entry point for direct page compaction.
*/
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio)
{
......

/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
enum compact_result status;

if (prio > MIN_COMPACT_PRIORITY
&& compaction_deferred(zone, order)) {
rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
continue;
}

status = compact_zone_order(zone, order, gfp_mask, prio,
alloc_flags, ac_classzone_idx(ac));

......

There is also the notion of deferring compaction: if compaction of a zone fails, defer_compaction is called, and before the next compaction of that zone, compaction_deferred is consulted to decide whether the attempt should be skipped.

struct zone {
        ......

        /*
         * These two fields are used to defer compaction. A deferral is only
         * recorded when the requested order is >= compact_order_failed.
         * They are reset in one case only: after this zone has been compacted
         * and an allocation from it succeeds.
         */
        /*
         * Counts how many times compaction of this zone has been considered
         * and deferred. It is incremented on every deferral; once it exceeds
         * 1UL << compact_defer_shift, compaction is allowed to run again.
         */
        unsigned int compact_considered;
        /*
         * Controls how long compaction stays deferred: the deferral limit is
         * 1UL << compact_defer_shift, splitting behaviour into the cases
         * compact_considered < limit (skip) and compact_considered >= limit
         * (run). It is reset to 0 when compaction of this zone succeeds and is
         * capped at COMPACT_MAX_DEFER_SHIFT. It is only increased when, after
         * a sync or light-sync compaction run, the zone's free pages still do
         * not reach (low watermark + 1 << order + reserved memory).
         */
        unsigned int compact_defer_shift;
        /*
         * The highest order at which compaction of this zone has failed; it
         * decides whether compaction is deferred. A request with an order
         * below this value is allowed to compact, otherwise a deferral is
         * recorded.
         * When compaction succeeds, this is set to the used order + 1, on the
         * assumption that the next higher order would still fail.
         * When compaction fails, this is set to the used order, meaning that
         * requests of this order are likely to fail again.
         */
        int compact_order_failed;

        ......
}

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. 1 << compact_defer_limit compactions are skipped up
 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
 */
void defer_compaction(struct zone *zone, int order)
{
        zone->compact_considered = 0;
        zone->compact_defer_shift++;

        if (order < zone->compact_order_failed)
                zone->compact_order_failed = order;

        if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

        trace_mm_compaction_defer_compaction(zone, order);
}

/* Returns true if compaction should be skipped this time */
bool compaction_deferred(struct zone *zone, int order)
{
        unsigned long defer_limit = 1UL << zone->compact_defer_shift;

        if (order < zone->compact_order_failed)
                return false;

        /* Avoid possible overflow */
        if (++zone->compact_considered > defer_limit)
                zone->compact_considered = defer_limit;

        if (zone->compact_considered >= defer_limit)
                return false;

        trace_mm_compaction_deferred(zone, order);

        return true;
}
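
Taken together, the two functions implement an exponential back-off: after each failed run the number of skipped attempts doubles, up to 1 << COMPACT_MAX_DEFER_SHIFT (6 in v5.0, i.e. at most 64 skipped attempts). A small user-space simulation of the two counters, assuming every compaction run fails again:

#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6

static unsigned int considered, defer_shift;

/* Mirrors compaction_deferred() (minus the order check): 1 = skip this attempt. */
static int deferred(void)
{
        unsigned long defer_limit = 1UL << defer_shift;

        if (++considered > defer_limit)
                considered = defer_limit;
        return considered < defer_limit;
}

/* Mirrors defer_compaction() after a failed run. */
static void defer(void)
{
        considered = 0;
        if (++defer_shift > COMPACT_MAX_DEFER_SHIFT)
                defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

int main(void)
{
        for (int attempt = 1; attempt <= 200; attempt++) {
                if (deferred())
                        continue;       /* this attempt is skipped */
                printf("attempt %3d runs compaction (defer_shift=%u)\n",
                       attempt, defer_shift);
                defer();                /* assume it fails again */
        }
        return 0;
}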

compact_zone_order fills in a compact_control and then calls compact_zone.

static enum compact_result compact_zone_order(struct zone *zone, int order,
                gfp_t gfp_mask, enum compact_priority prio,
                unsigned int alloc_flags, int classzone_idx)
{
        enum compact_result ret;
        struct compact_control cc = {
                .nr_freepages = 0,
                .nr_migratepages = 0,
                .total_migrate_scanned = 0,
                .total_free_scanned = 0,
                .order = order,
                .gfp_mask = gfp_mask,
                .zone = zone,
                .mode = (prio == COMPACT_PRIO_ASYNC) ?
                                        MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
                .alloc_flags = alloc_flags,
                .classzone_idx = classzone_idx,
                .direct_compaction = true,
                .whole_zone = (prio == MIN_COMPACT_PRIORITY),
                .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
                .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);

        ret = compact_zone(zone, &cc);

        VM_BUG_ON(!list_empty(&cc.freepages));
        VM_BUG_ON(!list_empty(&cc.migratepages));

        return ret;
}

compact_zone is the core of memory compaction and the point where all compaction paths converge. In short, it scans the zone from both ends, looking for migratable pages from one end and for free pages from the other, and stops when the two scanners meet or when the allocation request can already be satisfied (a block of the required order can be allocated while still meeting the minimum watermark).


First, compaction_suitable estimates whether this compaction run is necessary and whether it is likely to succeed.

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
static enum compact_result __compaction_suitable(struct zone *zone, int order,
                                        unsigned int alloc_flags,
                                        int classzone_idx,
                                        unsigned long wmark_target)

static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
{
        enum compact_result ret;
        unsigned long start_pfn = zone->zone_start_pfn;
        unsigned long end_pfn = zone_end_pfn(zone);
        const bool sync = cc->mode != MIGRATE_ASYNC;

        cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
        ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                                                        cc->classzone_idx);
        /* Compaction is likely to fail */
        if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
                return ret;

        /* huh, compaction_suitable is returning something unexpected */
        VM_BUG_ON(ret != COMPACT_CONTINUE);

Next comes the mechanism for clearing the PB_migrate_skip bits of pageblocks.

        /*
         * Clear pageblock skip if there were failures recently and compaction
         * is about to be retried after being deferred.
         */
        if (compaction_restarting(zone, cc->order))
                __reset_isolation_suitable(zone);
  • If the requested order is less than compact_order_failed (the highest order at which compaction of this zone has failed), the skip bits are not cleared.
  • If zone->compact_considered (the accumulated number of deferrals) is >= 1UL << zone->compact_defer_shift (the deferral threshold) and compact_defer_shift has already reached COMPACT_MAX_DEFER_SHIFT, the skip bits are cleared, as compaction_restarting shows:
/* Returns true if restarting compaction after many failures */
bool compaction_restarting(struct zone *zone, int order)
{
        if (order < zone->compact_order_failed)
                return false;

        return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
                zone->compact_considered >= 1UL << zone->compact_defer_shift;
}

Next, the starting positions of the two scanners are initialised. If cc->whole_zone is set, the whole zone is scanned; otherwise the positions cached from the previous scan (zone->compact_cached_migrate_pfn and zone->compact_cached_free_pfn) are used, after checking that they are still valid.

        /*
         * Setup to move all movable pages to the end of the zone. Used cached
         * information on where the scanners should start (unless we explicitly
         * want to compact the whole zone), but check that it is initialised
         * by ensuring the values are within zone boundaries.
         */
        if (cc->whole_zone) {
                cc->migrate_pfn = start_pfn;
                cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
        } else {
                cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
                cc->free_pfn = zone->compact_cached_free_pfn;
                if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
                        cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
                        zone->compact_cached_free_pfn = cc->free_pfn;
                }
                if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
                        cc->migrate_pfn = start_pfn;
                        zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
                        zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
                }

                if (cc->migrate_pfn == start_pfn)
                        cc->whole_zone = true;
        }
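
pageblock_start_pfn()/pageblock_end_pfn() used here simply align a pfn down/up to a pageblock boundary. A quick user-space illustration of the arithmetic, assuming pageblock_order = 9 (512 pages, i.e. 2MB pageblocks with 4KB pages, the common ARM64 4K configuration):

#include <stdio.h>

#define pageblock_order         9
#define pageblock_nr_pages      (1UL << pageblock_order)

/* Same arithmetic as block_start_pfn()/block_end_pfn() in mm/compaction.c. */
#define pageblock_start_pfn(pfn)        ((pfn) & ~(pageblock_nr_pages - 1))
#define pageblock_end_pfn(pfn)          (((pfn) + pageblock_nr_pages) & ~(pageblock_nr_pages - 1))

int main(void)
{
        unsigned long pfn = 0x12345;    /* arbitrary example pfn */

        printf("pfn %#lx lies in pageblock [%#lx, %#lx)\n",
               pfn, pageblock_start_pfn(pfn), pageblock_end_pfn(pfn));
        return 0;
}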

Preparing cc->migratepages

Next comes a big loop that keeps compacting until compact_finished no longer returns COMPACT_CONTINUE. Inside it, isolate_migratepages isolates the pages to be migrated.


while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
int err;

switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
/*
* We haven't isolated and migrated anything, but
* there might still be unflushed migrations from
* previous cc->order aligned block.
*/
goto check_drain;
case ISOLATE_SUCCESS:
;
}

err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION);

trace_mm_compaction_migratepages(cc->nr_migratepages, err,
&cc->migratepages);

/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
if (err) {
putback_movable_pages(&cc->migratepages);
/*
* migrate_pages() may return -ENOMEM when scanners meet
* and we want compact_finished() to detect it
*/
if (err == -ENOMEM && !compact_scanners_met(cc)) {
ret = COMPACT_CONTENDED;
goto out;
}
/*
* We failed to migrate at least one page in the current
* order-aligned block, so skip the rest of it.
*/
if (cc->direct_compaction &&
(cc->mode == MIGRATE_ASYNC)) {
cc->migrate_pfn = block_end_pfn(
cc->migrate_pfn - 1, cc->order);
/* Draining pcplists is useless in this case */
cc->last_migrated_pfn = 0;

}
}

check_drain:
/*
* Has the migration scanner moved away from the previous
* cc->order aligned block where we migrated from? If yes,
* flush the pages that were freed, so that they can merge and
* compact_finished() can detect immediately if allocation
* would succeed.
*/
if (cc->order > 0 && cc->last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);

if (cc->last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
drain_local_pages(zone);
put_cpu();
/* No more flushing until we migrate again */
cc->last_migrated_pfn = 0;
}
}

}

The migration scanner walks the pageblocks in its range, skipping blocks marked PB_migrate_skip (isolation failed there recently) and blocks that should not be used as a migration source in this run (see suitable_migration_source), and calls isolate_migratepages_block on each candidate pageblock.

/*
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
*/
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long block_start_pfn;
unsigned long block_end_pfn;
unsigned long low_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);

/*
* Start at where we last stopped, or beginning of the zone as
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < zone->zone_start_pfn)
block_start_pfn = zone->zone_start_pfn;

/* Only scan within a pageblock boundary */
block_end_pfn = pageblock_end_pfn(low_pfn);

/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
*/
for (; block_end_pfn <= cc->free_pfn;
low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {

/*
* This can potentially iterate a massively long zone with
* many pageblocks unsuitable, so periodically check if we
* need to schedule, or even abort async compaction.
*/
if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
&& compact_should_abort(cc))
break;

page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
if (!page)
continue;

/* If isolation recently failed, do not retry */
if (!isolation_suitable(cc, page))
continue;

/*
* For async compaction, also only scan in MOVABLE blocks.
* Async compaction is optimistic to see if the minimum amount
* of work satisfies the allocation.
*/
if (!suitable_migration_source(cc, page))
continue;

/* Perform the isolation */
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);

if (!low_pfn || cc->contended)
return ISOLATE_ABORT;

/*
* Either we isolated something and proceed with migration. Or
* we failed and compact_zone should decide if we should
* continue or not.
*/
break;
}

/* Record where migration scanner will be restarted. */
cc->migrate_pfn = low_pfn;

return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}


static bool suitable_migration_source(struct compact_control *cc,
                                                        struct page *page)
{
        int block_mt;

        if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
                return true;

        block_mt = get_pageblock_migratetype(page);

        if (cc->migratetype == MIGRATE_MOVABLE)
                return is_migrate_movable(block_mt);
        else
                return block_mt == cc->migratetype;
}
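
The PB_migrate_skip test mentioned above is isolation_suitable; a from-memory sketch of the v5.0 helper, which simply reads the pageblock's skip bit unless cc->ignore_skip_hint asks to ignore it:

static inline bool isolation_suitable(struct compact_control *cc,
                                        struct page *page)
{
        if (cc->ignore_skip_hint)
                return true;

        return !get_pageblock_skip(page);
}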

isolate_migratepages_block scans a single pageblock and isolates its migratable pages.

/**
* isolate_migratepages_block() - isolate all migrate-able pages within
* a single pageblock
* @cc: Compaction control structure.
* @low_pfn: The first PFN to isolate
* @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
* @isolate_mode: Isolation mode to be used.
*
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
* Returns zero if there is a fatal signal pending, otherwise PFN of the
* first page that was not scanned (which may be both less, equal to or more
* than end_pfn).
*
* The pages are isolated on cc->migratepages list (not required to be empty),
* and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
* is neither read nor updated.
*/
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t isolate_mode)
{
struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;

/*
* Ensure that there are not too many pages isolated from the LRU
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
return 0;

congestion_wait(BLK_RW_ASYNC, HZ/10);

if (fatal_signal_pending(current))
return 0;
}

if (compact_should_abort(cc))
return 0;

if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
skip_on_failure = true;
next_skip_pfn = block_end_pfn(low_pfn, cc->order);
}

/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {

if (skip_on_failure && low_pfn >= next_skip_pfn) {
/*
* We have isolated all migration candidates in the
* previous order-aligned block, and did not skip it due
* to failure. We should migrate the pages now and
* hopefully succeed compaction.
*/
if (nr_isolated)
break;

/*
* We failed to isolate in the previous order-aligned
* block. Set the new boundary to the end of the
* current block. Note we can't simply increase
* next_skip_pfn by 1 << order, as low_pfn might have
* been incremented by a higher number due to skipping
* a compound or a high-order buddy page in the
* previous loop iteration.
*/
next_skip_pfn = block_end_pfn(low_pfn, cc->order);
}

/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort async compaction
* if contended.
*/
if (!(low_pfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(zone_lru_lock(zone), flags,
&locked, cc))
break;

if (!pfn_valid_within(low_pfn))
goto isolate_fail;
nr_scanned++;

page = pfn_to_page(low_pfn);

if (!valid_page)
valid_page = page;

Free pages are skipped:

/*
* Skip if free. We read page order here without zone lock
* which is generally unsafe, but the race window is small and
* the worst thing that can happen is that we skip some
* potential isolation targets.
*/
if (PageBuddy(page)) {
unsigned long freepage_order = page_order_unsafe(page);

/*
* Without lock, we cannot be sure that what we got is
* a valid page order. Consider only values in the
* valid order range to prevent low_pfn overflow.
*/
if (freepage_order > 0 && freepage_order < MAX_ORDER)
low_pfn += (1UL << freepage_order) - 1;
continue;
}

Compound pages (e.g. THP and hugetlbfs pages) are skipped:

/*
* Regardless of being on LRU, compound pages such as THP and
* hugetlbfs are not to be compacted. We can potentially save
* a lot of iterations if we skip them at once. The check is
* racy, but we can consider only valid values and the only
* danger is skipping too much.
*/
if (PageCompound(page)) {
const unsigned int order = compound_order(page);

if (likely(order < MAX_ORDER))
low_pfn += (1UL << order) - 1;
goto isolate_fail;
}

Next come pages that are not on any LRU list. Such pages are generally not migratable, but pages that are marked movable (__PageMovable) and have not yet been isolated (!PageIsolated) can still be migrated. These are special migratable pages, for example virtio-balloon pages; isolate_movable_page invokes the owning driver's isolate_page method to isolate them.


/*
* Check may be lockless but that's ok as we recheck later.
* It's possible to migrate LRU and non-lru movable pages.
* Skip any other type of page
*/
if (!PageLRU(page)) {
/*
* __PageMovable can return false positive so we need
* to verify it under page_lock.
*/
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
spin_unlock_irqrestore(zone_lru_lock(zone),
flags);
locked = false;
}

if (!isolate_movable_page(page, isolate_mode))
goto isolate_success;
}

goto isolate_fail;
}
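
Drivers opt their pages into this non-LRU migration path by marking them movable and supplying isolate/migrate/putback callbacks in their address_space_operations, as described in Documentation/vm/page_migration. A heavily simplified sketch of such a registration (the driver_* names are placeholders, and real callbacks must manage references and driver metadata):

static bool driver_isolate_page(struct page *page, isolate_mode_t mode)
{
        /* Pin the page against driver-internal reuse and report success. */
        return true;
}

static int driver_migratepage(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
{
        /* Copy contents and driver metadata from page to newpage. */
        return MIGRATEPAGE_SUCCESS;
}

static void driver_putback_page(struct page *page)
{
        /* Isolation succeeded but migration did not; undo the isolation. */
}

static const struct address_space_operations driver_aops = {
        .isolate_page   = driver_isolate_page,
        .migratepage    = driver_migratepage,
        .putback_page   = driver_putback_page,
};

/*
 * When setting a page up, the driver calls
 *      __SetPageMovable(page, mapping);
 * with mapping->a_ops pointing at driver_aops.
 */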

Anonymous pages that are pinned in memory are also unsuitable for migration. !page_mapping(page) identifies an anonymous page, and page_count(page) > page_mapcount(page) means the kernel holds extra references on it, for example because it has been pinned via get_user_pages.

/*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
if (!page_mapping(page) &&
page_count(page) > page_mapcount(page))
goto isolate_fail;

/*
* Only allow to migrate anonymous pages in GFP_NOFS context
* because those do not depend on fs locks.
*/
if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
goto isolate_fail;

/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
locked = compact_trylock_irqsave(zone_lru_lock(zone),
&flags, cc);
if (!locked)
break;

/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
goto isolate_fail;

/*
* Page become compound since the non-locked check,
* and it's on LRU. It can only be a THP so the order
* is safe to read and it's 0 for tail pages.
*/
if (unlikely(PageCompound(page))) {
low_pfn += (1UL << compound_order(page)) - 1;
goto isolate_fail;
}
}

Next, LRU pages are handled: the PG_lru flag is cleared and the page is removed from the LRU list.


lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);

/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
goto isolate_fail;

VM_BUG_ON_PAGE(PageCompound(page), page);

/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
inc_node_page_state(page,
NR_ISOLATED_ANON + page_is_file_cache(page));

Pages that have been isolated successfully are added to cc->migratepages.

isolate_success:
list_add(&page->lru, &cc->migratepages);
cc->nr_migratepages++;
nr_isolated++;

/*
* Record where we could have freed pages by migration and not
* yet flushed them to buddy allocator.
* - this is the lowest page that was isolated and likely be
* then freed by migration.
*/
if (!cc->last_migrated_pfn)
cc->last_migrated_pfn = low_pfn;

/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}

continue;

On isolation failure, the pages already isolated onto cc->migratepages should be put back and compact_cached_migrate_pfn updated. If the whole pageblock was scanned without isolating a single page, the block is marked PB_migrate_skip.

isolate_fail:
if (!skip_on_failure)
continue;

/*
* We have isolated some pages, but then failed. Release them
* instead of migrating, as we cannot form the cc->order buddy
* page anyway.
*/
if (nr_isolated) {
if (locked) {
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
locked = false;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
cc->last_migrated_pfn = 0;
nr_isolated = 0;
}

if (low_pfn < next_skip_pfn) {
low_pfn = next_skip_pfn - 1;
/*
* The check near the loop beginning would have updated
* next_skip_pfn too, but this is a bit simpler.
*/
next_skip_pfn += 1UL << cc->order;
}
}

/*
* The PageBuddy() check could have potentially brought us outside
* the range to be scanned.
*/
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;

if (locked)
spin_unlock_irqrestore(zone_lru_lock(zone), flags);

/*
* Update the pageblock-skip information and cached scanner pfn,
* if the whole pageblock was scanned without isolating any page.
*/
if (low_pfn == end_pfn)
update_pageblock_skip(cc, valid_page, nr_isolated, true);

trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);

cc->total_migrate_scanned += nr_scanned;
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);

return low_pfn;

Preparing cc->freepages

cc->freepages is actually populated through a callback invoked from the page-migration interface migrate_pages.

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
                                        unsigned long data)
{
        struct compact_control *cc = (struct compact_control *)data;
        struct page *freepage;

        /*
         * Isolate free pages if necessary, and if we are not aborting due to
         * contention.
         */
        if (list_empty(&cc->freepages)) {
                if (!cc->contended)
                        isolate_freepages(cc);

                if (list_empty(&cc->freepages))
                        return NULL;
        }

        freepage = list_entry(cc->freepages.next, struct page, lru);
        list_del(&freepage->lru);
        cc->nr_freepages--;

        return freepage;
}
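
Its counterpart compaction_free, passed as put_new_page, simply returns an unused target page to cc->freepages; a from-memory sketch of the v5.0 implementation:

static void compaction_free(struct page *page, unsigned long data)
{
        struct compact_control *cc = (struct compact_control *)data;

        list_add(&page->lru, &cc->freepages);
        cc->nr_freepages++;
}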

The real work happens in isolate_freepages, which mirrors the isolation of cc->migratepages.
First the scan range is initialised.

/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
static void isolate_freepages(struct compact_control *cc)
{
struct zone *zone = cc->zone;
struct page *page;
unsigned long block_start_pfn; /* start of current pageblock */
unsigned long isolate_start_pfn; /* exact pfn we start at */
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
struct list_head *freelist = &cc->freepages;

/*
* Initialise the free scanner. The starting point is where we last
* successfully isolated from, zone-cached value, or the end of the
* zone when isolating for the first time. For looping we also need
* this pfn aligned down to the pageblock boundary, because we do
* block_start_pfn -= pageblock_nr_pages in the for loop.
* For ending point, take care when isolating in last pageblock of a
* a zone which ends in the middle of a pageblock.
* The low boundary is the end of the pageblock the migration scanner
* is using.
*/
isolate_start_pfn = cc->free_pfn;
block_start_pfn = pageblock_start_pfn(cc->free_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
low_pfn = pageblock_end_pfn(cc->migrate_pfn);

Then the scan loops from the end of the zone back towards its beginning.

/*
* Isolate free pages until enough are available to migrate the
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
for (; block_start_pfn >= low_pfn;
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
/*
* This can iterate a massively long zone without finding any
* suitable migration targets, so periodically check if we need
* to schedule, or even abort async compaction.
*/
if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
&& compact_should_abort(cc))
break;

page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
if (!page)
continue;

Each pageblock is checked for whether it is a suitable migration target.

                /* Check the block is suitable for migration */
                if (!suitable_migration_target(cc, page))
                        continue;

A page lies in a suitable migration target if and only if it is not a free page, or is a free page smaller than a pageblock, and additionally cc->ignore_block_suitable is set or the block's migrate type is MIGRATE_MOVABLE or MIGRATE_CMA.

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
                                                        struct page *page)
{
        /* If the page is a large free page, then disallow migration */
        if (PageBuddy(page)) {
                /*
                 * We are checking page_order without zone->lock taken. But
                 * the only small danger is that we skip a potentially suitable
                 * pageblock, so it's not worth to check order for valid range.
                 */
                if (page_order_unsafe(page) >= pageblock_order)
                        return false;
        }

        if (cc->ignore_block_suitable)
                return true;

        /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
        if (is_migrate_movable(get_pageblock_migratetype(page)))
                return true;

        /* Otherwise skip the block */
        return false;
}

If isolation in this pageblock failed recently (the block is marked PB_migrate_skip), the block is skipped.

                /* If isolation recently failed, do not retry */
                if (!isolation_suitable(cc, page))
                        continue;

isolate_freepages_block is then called to isolate free pages from the block.

                /* Found a block suitable for isolating free pages from. */
                isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
                                        freelist, false);

This repeats until enough free pages have been isolated or the scan is finished. map_pages then splits the high-order free pages on the freelist into order-0 pages.

                if ((cc->nr_freepages >= cc->nr_migratepages)
                                                        || cc->contended) {
                        if (isolate_start_pfn >= block_end_pfn) {
                                /*
                                 * Restart at previous pageblock if more
                                 * freepages can be isolated next time.
                                 */
                                isolate_start_pfn =
                                        block_start_pfn - pageblock_nr_pages;
                        }
                        break;
                } else if (isolate_start_pfn < block_end_pfn) {
                        /*
                         * If isolation failed early, do not continue
                         * needlessly.
                         */
                        break;
                }
        }

        /* __isolate_free_page() does not map the pages */
        map_pages(freelist);

        /*
         * Record where the free scanner will restart next time. Either we
         * broke from the loop and set isolate_start_pfn based on the last
         * call to isolate_freepages_block(), or we met the migration scanner
         * and the loop terminated due to isolate_start_pfn < low_pfn
         */
        cc->free_pfn = isolate_start_pfn;
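
map_pages, called above, prepares every isolated free page for use as a migration target: it runs the normal post-allocation hook and, for orders above 0, split_page. A from-memory sketch of the v5.0 helper:

static void map_pages(struct list_head *list)
{
        unsigned int i, order, nr_pages;
        struct page *page, *next;
        LIST_HEAD(tmp_list);

        list_for_each_entry_safe(page, next, list, lru) {
                list_del(&page->lru);

                order = page_private(page);
                nr_pages = 1 << order;

                post_alloc_hook(page, order, __GFP_MOVABLE);
                if (order)
                        split_page(page, order);

                for (i = 0; i < nr_pages; i++) {
                        list_add(&page->lru, &tmp_list);
                        page++;
                }
        }

        list_splice(&tmp_list, list);
}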

isolate_freepages_block works much like isolate_migratepages_block and is not described in detail here.

/*
* Isolate free pages onto a private freelist. If @strict is true, will abort
* returning 0 on any invalid PFNs or non-free pages inside of the pageblock
* (even though it may still end up isolating some pages).
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
struct page *cursor, *valid_page = NULL;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
unsigned int order;

cursor = pfn_to_page(blockpfn);

/* Isolate free pages. */
for (; blockpfn < end_pfn; blockpfn++, cursor++) {
int isolated;
struct page *page = cursor;

/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort if fatal signal
* pending or async compaction detects need_resched()
*/
if (!(blockpfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(&cc->zone->lock, flags,
&locked, cc))
break;

nr_scanned++;
if (!pfn_valid_within(blockpfn))
goto isolate_fail;

if (!valid_page)
valid_page = page;

/*
* For compound pages such as THP and hugetlbfs, we can save
* potentially a lot of iterations if we skip them at once.
* The check is racy, but we can consider only valid values
* and the only danger is skipping too much.
*/
if (PageCompound(page)) {
const unsigned int order = compound_order(page);

if (likely(order < MAX_ORDER)) {
blockpfn += (1UL << order) - 1;
cursor += (1UL << order) - 1;
}
goto isolate_fail;
}

if (!PageBuddy(page))
goto isolate_fail;

/*
* If we already hold the lock, we can skip some rechecking.
* Note that if we hold the lock now, checked_pageblock was
* already set in some previous iteration (or strict is true),
* so it is correct to skip the suitable migration target
* recheck as well.
*/
if (!locked) {
/*
* The zone lock must be held to isolate freepages.
* Unfortunately this is a very coarse lock and can be
* heavily contended if there are parallel allocations
* or parallel compactions. For async compaction do not
* spin on the lock and we acquire the lock as late as
* possible.
*/
locked = compact_trylock_irqsave(&cc->zone->lock,
&flags, cc);
if (!locked)
break;

/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
goto isolate_fail;
}

/* Found a free page, will break it into order-0 pages */
order = page_order(page);
isolated = __isolate_free_page(page, order);
if (!isolated)
break;
set_page_private(page, order);

total_isolated += isolated;
cc->nr_freepages += isolated;
list_add_tail(&page->lru, freelist);

if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
blockpfn += isolated;
break;
}
/* Advance to the end of split page */
blockpfn += isolated - 1;
cursor += isolated - 1;
continue;

isolate_fail:
if (strict)
break;
else
continue;

}

if (locked)
spin_unlock_irqrestore(&cc->zone->lock, flags);

/*
* There is a tiny chance that we have read bogus compound_order(),
* so be careful to not go outside of the pageblock.
*/
if (unlikely(blockpfn > end_pfn))
blockpfn = end_pfn;

trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
nr_scanned, total_isolated);

/* Record how far we have got within the block */
*start_pfn = blockpfn;

/*
* If strict isolation is requested by CMA then check that all the
* pages requested were isolated. If there were any failures, 0 is
* returned and CMA will fail.
*/
if (strict && blockpfn < end_pfn)
total_isolated = 0;

/* Update the pageblock-skip if the whole pageblock was scanned */
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);

cc->total_free_scanned += nr_scanned;
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
return total_isolated;
}

References

https://www.cnblogs.com/tolimit/p/5286663.html
