Linux Kernel Memory Compaction -- ARM64 v5.0

Questions

  1. What is the purpose of memory compaction?
  2. What is the difference between memory compaction and page migration?
  3. Why are pages divided into different migrate types?
  4. What does the compaction flow look like?

Overview

After the system has been running for a long time, free pages become more and more scattered, and allocating a large block of physically contiguous memory becomes increasingly difficult. Since large contiguous allocations are still needed at times, the fragmentation problem has to be addressed; the process of doing so is called memory compaction (also known as memory defragmentation).

The basic idea behind defragmentation is to group pages by how movable they are. Migrating physical memory used by the kernel itself would be difficult and complex, so the current kernel does not migrate the kernel's own pages. Pages used by user processes, on the other hand, are accessed through the user page tables, so moving them and updating the mappings is invisible to the process. Memory compaction is therefore built on top of page migration.

Pages are classified into migrate types such as movable, reclaimable, and unmovable. Movable pages are typically memory allocated by user-space processes; moving them only requires updating the page-table mappings. Reclaimable pages cannot be moved, but they can be freed.
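
For reference, the migrate types are declared roughly as follows in include/linux/mmzone.h around v5.0 (a sketch from memory; the exact members depend on options such as CONFIG_CMA and CONFIG_MEMORY_ISOLATION):

enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,       /* the number of types on the pcp lists */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES
};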

Memory compaction is one form of page migration. The kernel provides the migrate_pages interface for page migration; the caller only has to supply the set of pages to be migrated and the set of free pages to migrate them to (in the form of function pointers that obtain and release the target free pages).
Implementing memory compaction essentially comes down to building these two sets.

/*
* migrate_pages - migrate the pages specified in a list, to the free pages
* supplied as the target for the page migration
*
* @from: The list of pages to be migrated.
* @get_new_page: The function used to allocate free pages to be used
* as the target of the page migration.
* @put_new_page: The function used to free target pages if migration
* fails, or NULL if no special handling is necessary.
* @private: Private data to be passed on to get_new_page()
* @mode: The migration mode that specifies the constraints for
* page migration, if any.
* @reason: The reason for page migration.
*
* The function returns after 10 attempts or if no pages are movable any more
* because the list has become empty or no retryable pages exist any more.
* The caller should call putback_movable_pages() to return pages to the LRU
* or free list only if ret != 0.
*
* Returns the number of pages that were not migrated, or an error code.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
enum migrate_mode mode, int reason)
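
For orientation, a minimal (hypothetical) caller of this interface might look like the sketch below; the demo_* names are made up, and the callbacks follow the new_page_t/free_page_t signatures:

static struct page *demo_get_new_page(struct page *old, unsigned long private)
{
        /* Hand out any order-0 page as the migration target. */
        return alloc_page(GFP_KERNEL);
}

static void demo_put_new_page(struct page *newpage, unsigned long private)
{
        /* Called only for pages whose migration failed: release the target. */
        __free_page(newpage);
}

/* pages_to_move is a list of pages the caller has already isolated. */
static int demo_migrate(struct list_head *pages_to_move)
{
        int err;

        err = migrate_pages(pages_to_move, demo_get_new_page,
                            demo_put_new_page, 0, MIGRATE_SYNC, MR_COMPACTION);
        if (err)
                /* Pages that were not migrated must be put back by the caller. */
                putback_movable_pages(pages_to_move);
        return err;
}

Compaction itself follows exactly this pattern, passing compaction_alloc and compaction_free as the two callbacks (see below).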

Source Code

The entry point of direct compaction is __alloc_pages_direct_compact, which in turn calls try_to_compact_pages.

/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                unsigned int alloc_flags, const struct alloc_context *ac,
                enum compact_priority prio, enum compact_result *compact_result)
{
        struct page *page;
        unsigned long pflags;
        unsigned int noreclaim_flag;

        if (!order)
                return NULL;

        psi_memstall_enter(&pflags);
        noreclaim_flag = memalloc_noreclaim_save();

        *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
                                                                prio);

try_to_compact_pages iterates over every zone in ac->zonelist (subject to the nodemask) and calls compact_zone_order on each.

/**
* try_to_compact_pages - Direct compact to satisfy a high-order allocation
* @gfp_mask: The GFP mask of the current allocation
* @order: The order of the current allocation
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
* @prio: Determines how hard direct compaction should try to succeed
*
* This is the main entry point for direct page compaction.
*/
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio)
{
......

/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
enum compact_result status;

if (prio > MIN_COMPACT_PRIORITY
&& compaction_deferred(zone, order)) {
rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
continue;
}

status = compact_zone_order(zone, order, gfp_mask, prio,
alloc_flags, ac_classzone_idx(ac));

......

There is also the notion of deferring compaction: if compaction of a zone fails, defer_compaction is called, and before the next compaction of that zone, compaction_deferred is consulted to decide whether the attempt should be skipped.

struct zone {
        ......

        /*
         * These two fields are used to defer compaction. A deferral is only
         * recorded when the requested order is >= compact_order_failed.
         * They are reset in one case only: after this zone has been compacted
         * and an allocation from it succeeds.
         */
        /*
         * Counts how many times compaction of this zone has been considered
         * and deferred. It is incremented on every deferral; once it exceeds
         * 1UL << compact_defer_shift, compaction is allowed to run again.
         */
        unsigned int compact_considered;
        /*
         * Controls how long compaction stays deferred: the deferral limit is
         * 1UL << compact_defer_shift, splitting behaviour into the cases
         * compact_considered < limit (skip) and compact_considered >= limit
         * (run). It is reset to 0 when compaction of this zone succeeds and is
         * capped at COMPACT_MAX_DEFER_SHIFT. It is only increased when, after
         * a sync or light-sync compaction run, the zone's free pages still do
         * not reach (low watermark + 1 << order + reserved memory).
         */
        unsigned int compact_defer_shift;
        /*
         * The highest order at which compaction of this zone has failed; it
         * decides whether compaction is deferred. A request with an order
         * below this value is allowed to compact, otherwise a deferral is
         * recorded.
         * When compaction succeeds, this is set to the used order + 1, on the
         * assumption that the next higher order would still fail.
         * When compaction fails, this is set to the used order, meaning that
         * requests of this order are likely to fail again.
         */
        int compact_order_failed;

        ......
}

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. 1 << compact_defer_limit compactions are skipped up
 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
 */
void defer_compaction(struct zone *zone, int order)
{
        zone->compact_considered = 0;
        zone->compact_defer_shift++;

        if (order < zone->compact_order_failed)
                zone->compact_order_failed = order;

        if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

        trace_mm_compaction_defer_compaction(zone, order);
}

/* Returns true if compaction should be skipped this time */
bool compaction_deferred(struct zone *zone, int order)
{
        unsigned long defer_limit = 1UL << zone->compact_defer_shift;

        if (order < zone->compact_order_failed)
                return false;

        /* Avoid possible overflow */
        if (++zone->compact_considered > defer_limit)
                zone->compact_considered = defer_limit;

        if (zone->compact_considered >= defer_limit)
                return false;

        trace_mm_compaction_deferred(zone, order);

        return true;
}
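
Taken together, the two functions implement an exponential back-off: after each failed run the number of skipped attempts doubles, up to 1 << COMPACT_MAX_DEFER_SHIFT (6 in v5.0, i.e. at most 64 skipped attempts). A small user-space simulation of the two counters, assuming every compaction run fails again:

#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6

static unsigned int considered, defer_shift;

/* Mirrors compaction_deferred() (minus the order check): 1 = skip this attempt. */
static int deferred(void)
{
        unsigned long defer_limit = 1UL << defer_shift;

        if (++considered > defer_limit)
                considered = defer_limit;
        return considered < defer_limit;
}

/* Mirrors defer_compaction() after a failed run. */
static void defer(void)
{
        considered = 0;
        if (++defer_shift > COMPACT_MAX_DEFER_SHIFT)
                defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

int main(void)
{
        for (int attempt = 1; attempt <= 200; attempt++) {
                if (deferred())
                        continue;       /* this attempt is skipped */
                printf("attempt %3d runs compaction (defer_shift=%u)\n",
                       attempt, defer_shift);
                defer();                /* assume it fails again */
        }
        return 0;
}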

compact_zone_order fills in a compact_control and then calls compact_zone.

static enum compact_result compact_zone_order(struct zone *zone, int order,
                gfp_t gfp_mask, enum compact_priority prio,
                unsigned int alloc_flags, int classzone_idx)
{
        enum compact_result ret;
        struct compact_control cc = {
                .nr_freepages = 0,
                .nr_migratepages = 0,
                .total_migrate_scanned = 0,
                .total_free_scanned = 0,
                .order = order,
                .gfp_mask = gfp_mask,
                .zone = zone,
                .mode = (prio == COMPACT_PRIO_ASYNC) ?
                                        MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
                .alloc_flags = alloc_flags,
                .classzone_idx = classzone_idx,
                .direct_compaction = true,
                .whole_zone = (prio == MIN_COMPACT_PRIORITY),
                .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
                .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);

        ret = compact_zone(zone, &cc);

        VM_BUG_ON(!list_empty(&cc.freepages));
        VM_BUG_ON(!list_empty(&cc.migratepages));

        return ret;
}

compact_zone is the core of memory compaction and the point where all compaction paths converge. In short, it scans the zone from both ends, looking for migratable pages from one end and for free pages from the other, and stops when the two scanners meet or when the allocation request can already be satisfied (a block of the required order can be allocated while still meeting the minimum watermark).


First, compaction_suitable estimates whether this compaction run is necessary and whether it is likely to succeed.

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
static enum compact_result __compaction_suitable(struct zone *zone, int order,
                                        unsigned int alloc_flags,
                                        int classzone_idx,
                                        unsigned long wmark_target)

static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
{
        enum compact_result ret;
        unsigned long start_pfn = zone->zone_start_pfn;
        unsigned long end_pfn = zone_end_pfn(zone);
        const bool sync = cc->mode != MIGRATE_ASYNC;

        cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
        ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                                                        cc->classzone_idx);
        /* Compaction is likely to fail */
        if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
                return ret;

        /* huh, compaction_suitable is returning something unexpected */
        VM_BUG_ON(ret != COMPACT_CONTINUE);

Next comes the mechanism for clearing the PB_migrate_skip bits of pageblocks.

        /*
         * Clear pageblock skip if there were failures recently and compaction
         * is about to be retried after being deferred.
         */
        if (compaction_restarting(zone, cc->order))
                __reset_isolation_suitable(zone);
  • If the requested order is less than compact_order_failed (the highest order at which compaction of this zone has failed), the skip bits are not cleared.
  • If zone->compact_considered (the accumulated number of deferrals) is >= 1UL << zone->compact_defer_shift (the deferral threshold) and compact_defer_shift has already reached COMPACT_MAX_DEFER_SHIFT, the skip bits are cleared, as compaction_restarting shows:
/* Returns true if restarting compaction after many failures */
bool compaction_restarting(struct zone *zone, int order)
{
        if (order < zone->compact_order_failed)
                return false;

        return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
                zone->compact_considered >= 1UL << zone->compact_defer_shift;
}

Next, the starting positions of the two scanners are initialised. If cc->whole_zone is set, the whole zone is scanned; otherwise the positions cached from the previous scan (zone->compact_cached_migrate_pfn and zone->compact_cached_free_pfn) are used, after checking that they are still valid.

        /*
         * Setup to move all movable pages to the end of the zone. Used cached
         * information on where the scanners should start (unless we explicitly
         * want to compact the whole zone), but check that it is initialised
         * by ensuring the values are within zone boundaries.
         */
        if (cc->whole_zone) {
                cc->migrate_pfn = start_pfn;
                cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
        } else {
                cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
                cc->free_pfn = zone->compact_cached_free_pfn;
                if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
                        cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
                        zone->compact_cached_free_pfn = cc->free_pfn;
                }
                if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
                        cc->migrate_pfn = start_pfn;
                        zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
                        zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
                }

                if (cc->migrate_pfn == start_pfn)
                        cc->whole_zone = true;
        }
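
pageblock_start_pfn()/pageblock_end_pfn() used here simply align a pfn down/up to a pageblock boundary. A quick user-space illustration of the arithmetic, assuming pageblock_order = 9 (512 pages, i.e. 2MB pageblocks with 4KB pages, the common ARM64 4K configuration):

#include <stdio.h>

#define pageblock_order         9
#define pageblock_nr_pages      (1UL << pageblock_order)

/* Same arithmetic as block_start_pfn()/block_end_pfn() in mm/compaction.c. */
#define pageblock_start_pfn(pfn)        ((pfn) & ~(pageblock_nr_pages - 1))
#define pageblock_end_pfn(pfn)          (((pfn) + pageblock_nr_pages) & ~(pageblock_nr_pages - 1))

int main(void)
{
        unsigned long pfn = 0x12345;    /* arbitrary example pfn */

        printf("pfn %#lx lies in pageblock [%#lx, %#lx)\n",
               pfn, pageblock_start_pfn(pfn), pageblock_end_pfn(pfn));
        return 0;
}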

Preparing cc->migratepages

Next comes a big loop that keeps compacting until compact_finished no longer returns COMPACT_CONTINUE. Inside it, isolate_migratepages isolates the pages to be migrated.


while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
int err;

switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
/*
* We haven't isolated and migrated anything, but
* there might still be unflushed migrations from
* previous cc->order aligned block.
*/
goto check_drain;
case ISOLATE_SUCCESS:
;
}

err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION);

trace_mm_compaction_migratepages(cc->nr_migratepages, err,
&cc->migratepages);

/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
if (err) {
putback_movable_pages(&cc->migratepages);
/*
* migrate_pages() may return -ENOMEM when scanners meet
* and we want compact_finished() to detect it
*/
if (err == -ENOMEM && !compact_scanners_met(cc)) {
ret = COMPACT_CONTENDED;
goto out;
}
/*
* We failed to migrate at least one page in the current
* order-aligned block, so skip the rest of it.
*/
if (cc->direct_compaction &&
(cc->mode == MIGRATE_ASYNC)) {
cc->migrate_pfn = block_end_pfn(
cc->migrate_pfn - 1, cc->order);
/* Draining pcplists is useless in this case */
cc->last_migrated_pfn = 0;

}
}

check_drain:
/*
* Has the migration scanner moved away from the previous
* cc->order aligned block where we migrated from? If yes,
* flush the pages that were freed, so that they can merge and
* compact_finished() can detect immediately if allocation
* would succeed.
*/
if (cc->order > 0 && cc->last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);

if (cc->last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
drain_local_pages(zone);
put_cpu();
/* No more flushing until we migrate again */
cc->last_migrated_pfn = 0;
}
}

}

The migration scanner walks the pageblocks in its range, skipping blocks marked PB_migrate_skip (isolation failed there recently) and blocks that should not be used as a migration source in this run (see suitable_migration_source), and calls isolate_migratepages_block on each candidate pageblock.

/*
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
*/
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long block_start_pfn;
unsigned long block_end_pfn;
unsigned long low_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);

/*
* Start at where we last stopped, or beginning of the zone as
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < zone->zone_start_pfn)
block_start_pfn = zone->zone_start_pfn;

/* Only scan within a pageblock boundary */
block_end_pfn = pageblock_end_pfn(low_pfn);

/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
*/
for (; block_end_pfn <= cc->free_pfn;
low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {

/*
* This can potentially iterate a massively long zone with
* many pageblocks unsuitable, so periodically check if we
* need to schedule, or even abort async compaction.
*/
if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
&& compact_should_abort(cc))
break;

page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
if (!page)
continue;

/* If isolation recently failed, do not retry */
if (!isolation_suitable(cc, page))
continue;

/*
* For async compaction, also only scan in MOVABLE blocks.
* Async compaction is optimistic to see if the minimum amount
* of work satisfies the allocation.
*/
if (!suitable_migration_source(cc, page))
continue;

/* Perform the isolation */
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);

if (!low_pfn || cc->contended)
return ISOLATE_ABORT;

/*
* Either we isolated something and proceed with migration. Or
* we failed and compact_zone should decide if we should
* continue or not.
*/
break;
}

/* Record where migration scanner will be restarted. */
cc->migrate_pfn = low_pfn;

return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}


static bool suitable_migration_source(struct compact_control *cc,
                                                        struct page *page)
{
        int block_mt;

        if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
                return true;

        block_mt = get_pageblock_migratetype(page);

        if (cc->migratetype == MIGRATE_MOVABLE)
                return is_migrate_movable(block_mt);
        else
                return block_mt == cc->migratetype;
}
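
The PB_migrate_skip test mentioned above is isolation_suitable; a from-memory sketch of the v5.0 helper, which simply reads the pageblock's skip bit unless cc->ignore_skip_hint asks to ignore it:

static inline bool isolation_suitable(struct compact_control *cc,
                                        struct page *page)
{
        if (cc->ignore_skip_hint)
                return true;

        return !get_pageblock_skip(page);
}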

isolate_migratepages_block scans a single pageblock and isolates its migratable pages.

/**
* isolate_migratepages_block() - isolate all migrate-able pages within
* a single pageblock
* @cc: Compaction control structure.
* @low_pfn: The first PFN to isolate
* @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
* @isolate_mode: Isolation mode to be used.
*
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
* Returns zero if there is a fatal signal pending, otherwise PFN of the
* first page that was not scanned (which may be both less, equal to or more
* than end_pfn).
*
* The pages are isolated on cc->migratepages list (not required to be empty),
* and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
* is neither read nor updated.
*/
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t isolate_mode)
{
struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;

/*
* Ensure that there are not too many pages isolated from the LRU
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
return 0;

congestion_wait(BLK_RW_ASYNC, HZ/10);

if (fatal_signal_pending(current))
return 0;
}

if (compact_should_abort(cc))
return 0;

if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
skip_on_failure = true;
next_skip_pfn = block_end_pfn(low_pfn, cc->order);
}

/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {

if (skip_on_failure && low_pfn >= next_skip_pfn) {
/*
* We have isolated all migration candidates in the
* previous order-aligned block, and did not skip it due
* to failure. We should migrate the pages now and
* hopefully succeed compaction.
*/
if (nr_isolated)
break;

/*
* We failed to isolate in the previous order-aligned
* block. Set the new boundary to the end of the
* current block. Note we can't simply increase
* next_skip_pfn by 1 << order, as low_pfn might have
* been incremented by a higher number due to skipping
* a compound or a high-order buddy page in the
* previous loop iteration.
*/
next_skip_pfn = block_end_pfn(low_pfn, cc->order);
}

/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort async compaction
* if contended.
*/
if (!(low_pfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(zone_lru_lock(zone), flags,
&locked, cc))
break;

if (!pfn_valid_within(low_pfn))
goto isolate_fail;
nr_scanned++;

page = pfn_to_page(low_pfn);

if (!valid_page)
valid_page = page;

Free pages are skipped:

/*
* Skip if free. We read page order here without zone lock
* which is generally unsafe, but the race window is small and
* the worst thing that can happen is that we skip some
* potential isolation targets.
*/
if (PageBuddy(page)) {
unsigned long freepage_order = page_order_unsafe(page);

/*
* Without lock, we cannot be sure that what we got is
* a valid page order. Consider only values in the
* valid order range to prevent low_pfn overflow.
*/
if (freepage_order > 0 && freepage_order < MAX_ORDER)
low_pfn += (1UL << freepage_order) - 1;
continue;
}

Compound pages (e.g. THP and hugetlbfs pages) are skipped:

/*
* Regardless of being on LRU, compound pages such as THP and
* hugetlbfs are not to be compacted. We can potentially save
* a lot of iterations if we skip them at once. The check is
* racy, but we can consider only valid values and the only
* danger is skipping too much.
*/
if (PageCompound(page)) {
const unsigned int order = compound_order(page);

if (likely(order < MAX_ORDER))
low_pfn += (1UL << order) - 1;
goto isolate_fail;
}

Next come pages that are not on any LRU list. Such pages are generally not migratable, but pages that are marked movable (__PageMovable) and have not yet been isolated (!PageIsolated) can still be migrated. These are special migratable pages, for example virtio-balloon pages; isolate_movable_page invokes the owning driver's isolate_page method to isolate them.


/*
* Check may be lockless but that's ok as we recheck later.
* It's possible to migrate LRU and non-lru movable pages.
* Skip any other type of page
*/
if (!PageLRU(page)) {
/*
* __PageMovable can return false positive so we need
* to verify it under page_lock.
*/
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
spin_unlock_irqrestore(zone_lru_lock(zone),
flags);
locked = false;
}

if (!isolate_movable_page(page, isolate_mode))
goto isolate_success;
}

goto isolate_fail;
}
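
Drivers opt their pages into this non-LRU migration path by marking them movable and supplying isolate/migrate/putback callbacks in their address_space_operations, as described in Documentation/vm/page_migration. A heavily simplified sketch of such a registration (the driver_* names are placeholders, and real callbacks must manage references and driver metadata):

static bool driver_isolate_page(struct page *page, isolate_mode_t mode)
{
        /* Pin the page against driver-internal reuse and report success. */
        return true;
}

static int driver_migratepage(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
{
        /* Copy contents and driver metadata from page to newpage. */
        return MIGRATEPAGE_SUCCESS;
}

static void driver_putback_page(struct page *page)
{
        /* Isolation succeeded but migration did not; undo the isolation. */
}

static const struct address_space_operations driver_aops = {
        .isolate_page   = driver_isolate_page,
        .migratepage    = driver_migratepage,
        .putback_page   = driver_putback_page,
};

/*
 * When setting a page up, the driver calls
 *      __SetPageMovable(page, mapping);
 * with mapping->a_ops pointing at driver_aops.
 */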

Anonymous pages that are pinned in memory are also unsuitable for migration. !page_mapping(page) identifies an anonymous page, and page_count(page) > page_mapcount(page) means the kernel holds extra references on it, for example because it has been pinned via get_user_pages.

/*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
if (!page_mapping(page) &&
page_count(page) > page_mapcount(page))
goto isolate_fail;

/*
* Only allow to migrate anonymous pages in GFP_NOFS context
* because those do not depend on fs locks.
*/
if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
goto isolate_fail;

/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
locked = compact_trylock_irqsave(zone_lru_lock(zone),
&flags, cc);
if (!locked)
break;

/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
goto isolate_fail;

/*
* Page become compound since the non-locked check,
* and it's on LRU. It can only be a THP so the order
* is safe to read and it's 0 for tail pages.
*/
if (unlikely(PageCompound(page))) {
low_pfn += (1UL << compound_order(page)) - 1;
goto isolate_fail;
}
}

Next, LRU pages are handled: the PG_lru flag is cleared and the page is removed from the LRU list.


lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);

/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
goto isolate_fail;

VM_BUG_ON_PAGE(PageCompound(page), page);

/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
inc_node_page_state(page,
NR_ISOLATED_ANON + page_is_file_cache(page));

Pages that have been isolated successfully are added to cc->migratepages.

isolate_success:
list_add(&page->lru, &cc->migratepages);
cc->nr_migratepages++;
nr_isolated++;

/*
* Record where we could have freed pages by migration and not
* yet flushed them to buddy allocator.
* - this is the lowest page that was isolated and likely be
* then freed by migration.
*/
if (!cc->last_migrated_pfn)
cc->last_migrated_pfn = low_pfn;

/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}

continue;

On isolation failure, the pages already isolated onto cc->migratepages should be put back and compact_cached_migrate_pfn updated. If the whole pageblock was scanned without isolating a single page, the block is marked PB_migrate_skip.

isolate_fail:
if (!skip_on_failure)
continue;

/*
* We have isolated some pages, but then failed. Release them
* instead of migrating, as we cannot form the cc->order buddy
* page anyway.
*/
if (nr_isolated) {
if (locked) {
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
locked = false;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
cc->last_migrated_pfn = 0;
nr_isolated = 0;
}

if (low_pfn < next_skip_pfn) {
low_pfn = next_skip_pfn - 1;
/*
* The check near the loop beginning would have updated
* next_skip_pfn too, but this is a bit simpler.
*/
next_skip_pfn += 1UL << cc->order;
}
}

/*
* The PageBuddy() check could have potentially brought us outside
* the range to be scanned.
*/
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;

if (locked)
spin_unlock_irqrestore(zone_lru_lock(zone), flags);

/*
* Update the pageblock-skip information and cached scanner pfn,
* if the whole pageblock was scanned without isolating any page.
*/
if (low_pfn == end_pfn)
update_pageblock_skip(cc, valid_page, nr_isolated, true);

trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);

cc->total_migrate_scanned += nr_scanned;
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);

return low_pfn;

Preparing cc->freepages

cc->freepages is actually populated through a callback invoked from the page-migration interface migrate_pages.

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
                                        unsigned long data)
{
        struct compact_control *cc = (struct compact_control *)data;
        struct page *freepage;

        /*
         * Isolate free pages if necessary, and if we are not aborting due to
         * contention.
         */
        if (list_empty(&cc->freepages)) {
                if (!cc->contended)
                        isolate_freepages(cc);

                if (list_empty(&cc->freepages))
                        return NULL;
        }

        freepage = list_entry(cc->freepages.next, struct page, lru);
        list_del(&freepage->lru);
        cc->nr_freepages--;

        return freepage;
}
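
Its counterpart compaction_free, passed as put_new_page, simply returns an unused target page to cc->freepages; a from-memory sketch of the v5.0 implementation:

static void compaction_free(struct page *page, unsigned long data)
{
        struct compact_control *cc = (struct compact_control *)data;

        list_add(&page->lru, &cc->freepages);
        cc->nr_freepages++;
}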

The real work happens in isolate_freepages, which mirrors the isolation of cc->migratepages.
First the scan range is initialised.

/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
static void isolate_freepages(struct compact_control *cc)
{
struct zone *zone = cc->zone;
struct page *page;
unsigned long block_start_pfn; /* start of current pageblock */
unsigned long isolate_start_pfn; /* exact pfn we start at */
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
struct list_head *freelist = &cc->freepages;

/*
* Initialise the free scanner. The starting point is where we last
* successfully isolated from, zone-cached value, or the end of the
* zone when isolating for the first time. For looping we also need
* this pfn aligned down to the pageblock boundary, because we do
* block_start_pfn -= pageblock_nr_pages in the for loop.
* For ending point, take care when isolating in last pageblock of a
* a zone which ends in the middle of a pageblock.
* The low boundary is the end of the pageblock the migration scanner
* is using.
*/
isolate_start_pfn = cc->free_pfn;
block_start_pfn = pageblock_start_pfn(cc->free_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
low_pfn = pageblock_end_pfn(cc->migrate_pfn);

Then the scan loops from the end of the zone back towards its beginning.

/*
* Isolate free pages until enough are available to migrate the
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
for (; block_start_pfn >= low_pfn;
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
/*
* This can iterate a massively long zone without finding any
* suitable migration targets, so periodically check if we need
* to schedule, or even abort async compaction.
*/
if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
&& compact_should_abort(cc))
break;

page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
if (!page)
continue;

Each pageblock is checked for whether it is a suitable migration target.

                /* Check the block is suitable for migration */
                if (!suitable_migration_target(cc, page))
                        continue;

A page lies in a suitable migration target if and only if it is not a free page, or is a free page smaller than a pageblock, and additionally cc->ignore_block_suitable is set or the block's migrate type is MIGRATE_MOVABLE or MIGRATE_CMA.

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
                                                        struct page *page)
{
        /* If the page is a large free page, then disallow migration */
        if (PageBuddy(page)) {
                /*
                 * We are checking page_order without zone->lock taken. But
                 * the only small danger is that we skip a potentially suitable
                 * pageblock, so it's not worth to check order for valid range.
                 */
                if (page_order_unsafe(page) >= pageblock_order)
                        return false;
        }

        if (cc->ignore_block_suitable)
                return true;

        /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
        if (is_migrate_movable(get_pageblock_migratetype(page)))
                return true;

        /* Otherwise skip the block */
        return false;
}

If isolation in this pageblock failed recently (the block is marked PB_migrate_skip), the block is skipped.

                /* If isolation recently failed, do not retry */
                if (!isolation_suitable(cc, page))
                        continue;

isolate_freepages_block is then called to isolate free pages from the block.

                /* Found a block suitable for isolating free pages from. */
                isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
                                        freelist, false);

This repeats until enough free pages have been isolated or the scan is finished. map_pages then splits the high-order free pages on the freelist into order-0 pages.

                if ((cc->nr_freepages >= cc->nr_migratepages)
                                                        || cc->contended) {
                        if (isolate_start_pfn >= block_end_pfn) {
                                /*
                                 * Restart at previous pageblock if more
                                 * freepages can be isolated next time.
                                 */
                                isolate_start_pfn =
                                        block_start_pfn - pageblock_nr_pages;
                        }
                        break;
                } else if (isolate_start_pfn < block_end_pfn) {
                        /*
                         * If isolation failed early, do not continue
                         * needlessly.
                         */
                        break;
                }
        }

        /* __isolate_free_page() does not map the pages */
        map_pages(freelist);

        /*
         * Record where the free scanner will restart next time. Either we
         * broke from the loop and set isolate_start_pfn based on the last
         * call to isolate_freepages_block(), or we met the migration scanner
         * and the loop terminated due to isolate_start_pfn < low_pfn
         */
        cc->free_pfn = isolate_start_pfn;
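
map_pages, called above, prepares every isolated free page for use as a migration target: it runs the normal post-allocation hook and, for orders above 0, split_page. A from-memory sketch of the v5.0 helper:

static void map_pages(struct list_head *list)
{
        unsigned int i, order, nr_pages;
        struct page *page, *next;
        LIST_HEAD(tmp_list);

        list_for_each_entry_safe(page, next, list, lru) {
                list_del(&page->lru);

                order = page_private(page);
                nr_pages = 1 << order;

                post_alloc_hook(page, order, __GFP_MOVABLE);
                if (order)
                        split_page(page, order);

                for (i = 0; i < nr_pages; i++) {
                        list_add(&page->lru, &tmp_list);
                        page++;
                }
        }

        list_splice(&tmp_list, list);
}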

isolate_freepages_block works much like isolate_migratepages_block and is not described in detail here.

/*
* Isolate free pages onto a private freelist. If @strict is true, will abort
* returning 0 on any invalid PFNs or non-free pages inside of the pageblock
* (even though it may still end up isolating some pages).
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
struct page *cursor, *valid_page = NULL;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
unsigned int order;

cursor = pfn_to_page(blockpfn);

/* Isolate free pages. */
for (; blockpfn < end_pfn; blockpfn++, cursor++) {
int isolated;
struct page *page = cursor;

/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort if fatal signal
* pending or async compaction detects need_resched()
*/
if (!(blockpfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(&cc->zone->lock, flags,
&locked, cc))
break;

nr_scanned++;
if (!pfn_valid_within(blockpfn))
goto isolate_fail;

if (!valid_page)
valid_page = page;

/*
* For compound pages such as THP and hugetlbfs, we can save
* potentially a lot of iterations if we skip them at once.
* The check is racy, but we can consider only valid values
* and the only danger is skipping too much.
*/
if (PageCompound(page)) {
const unsigned int order = compound_order(page);

if (likely(order < MAX_ORDER)) {
blockpfn += (1UL << order) - 1;
cursor += (1UL << order) - 1;
}
goto isolate_fail;
}

if (!PageBuddy(page))
goto isolate_fail;

/*
* If we already hold the lock, we can skip some rechecking.
* Note that if we hold the lock now, checked_pageblock was
* already set in some previous iteration (or strict is true),
* so it is correct to skip the suitable migration target
* recheck as well.
*/
if (!locked) {
/*
* The zone lock must be held to isolate freepages.
* Unfortunately this is a very coarse lock and can be
* heavily contended if there are parallel allocations
* or parallel compactions. For async compaction do not
* spin on the lock and we acquire the lock as late as
* possible.
*/
locked = compact_trylock_irqsave(&cc->zone->lock,
&flags, cc);
if (!locked)
break;

/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
goto isolate_fail;
}

/* Found a free page, will break it into order-0 pages */
order = page_order(page);
isolated = __isolate_free_page(page, order);
if (!isolated)
break;
set_page_private(page, order);

total_isolated += isolated;
cc->nr_freepages += isolated;
list_add_tail(&page->lru, freelist);

if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
blockpfn += isolated;
break;
}
/* Advance to the end of split page */
blockpfn += isolated - 1;
cursor += isolated - 1;
continue;

isolate_fail:
if (strict)
break;
else
continue;

}

if (locked)
spin_unlock_irqrestore(&cc->zone->lock, flags);

/*
* There is a tiny chance that we have read bogus compound_order(),
* so be careful to not go outside of the pageblock.
*/
if (unlikely(blockpfn > end_pfn))
blockpfn = end_pfn;

trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
nr_scanned, total_isolated);

/* Record how far we have got within the block */
*start_pfn = blockpfn;

/*
* If strict isolation is requested by CMA then check that all the
* pages requested were isolated. If there were any failures, 0 is
* returned and CMA will fail.
*/
if (strict && blockpfn < end_pfn)
total_isolated = 0;

/* Update the pageblock-skip if the whole pageblock was scanned */
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);

cc->total_free_scanned += nr_scanned;
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
return total_isolated;
}

References

https://www.cnblogs.com/tolimit/p/5286663.html
