SLUB Allocator Source Code Analysis

CONFIG_SLAB_FREELIST_HARDENED crushed my dream of becoming a SLUB master

Preface

I wrote this article for three reasons. The first is the advice of two seniors:

"If all you ever do is read other people's analyses, you will never write one yourself."
"When you hit a problem, read the source first, then the documentation; always keep the habit of reading source code."
"Doing kernel work without reading SLUB is like doing user-mode work without reading ptmalloc2."

The second is that while reproducing kernel vulnerabilities recently I found reading the source a bit of a struggle, so I picked the memory-management code, whose logic is fairly self-contained, to practice on.
The third is that the gef plugin developed by beta24 has saved me a lot of work, and I want to understand how it is implemented.

This article mainly walks through the allocation and free flows of the SLUB allocator. Given my limited skill and the amount of material, mistakes are hard to avoid; if you find any, please contact me so I can fix them. The code is from linux-6.6.7.

Annotated source: https://github.com/Polaris-Snowfall/linux-kernel-comment/blob/main/slub.comment.c

Structure Analysis

kmem_cache (the memory pool for objects of a specific size or a special purpose)

/*
* Slab cache management.
*/
struct kmem_cache {
#ifndef CONFIG_SLUB_TINY
struct kmem_cache_cpu __percpu *cpu_slab;
#endif
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
unsigned long min_partial; //lower bound on the number of slabs kept on the node partial list
unsigned int size; /* The size of an object including metadata */
unsigned int object_size;/* The size of an object without metadata */
struct reciprocal_value reciprocal_size;
//free objects managed by different kmem_caches keep the free pointer at different offsets inside the object; offset records that offset
unsigned int offset; /* Free pointer offset */
#ifdef CONFIG_SLUB_CPU_PARTIAL
/* max number of objects kmem_cache_cpu->partial may hold */
unsigned int cpu_partial;
/* max number of slabs kmem_cache_cpu->partial may hold */
unsigned int cpu_partial_slabs;
#endif
/* low 16 bits: how many objects of @size fit in a slab of @order; high 16 bits: @order */
struct kmem_cache_order_objects oo;

/* Allocation and freeing of slabs */
struct kmem_cache_order_objects min;
gfp_t allocflags; /* gfp flags to use on each alloc */
int refcount; /* Refcount for slab cache destroy */
void (*ctor)(void *);
unsigned int inuse; /* Offset to metadata */
unsigned int align; /* Alignment */
unsigned int red_left_pad; /* Left redzone padding size */
const char *name; /* Name (only for display!) */
struct list_head list; /* List of slab caches */
#ifdef CONFIG_SYSFS
struct kobject kobj; /* For sysfs */
#endif
#ifdef CONFIG_SLAB_FREELIST_HARDENED
unsigned long random; //XOR value used to obfuscate free pointers
#endif

#ifdef CONFIG_NUMA
/*
* Defragmentation by allocating from a remote node.
*/
unsigned int remote_node_defrag_ratio;
#endif

#ifdef CONFIG_SLAB_FREELIST_RANDOM
unsigned int *random_seq;
#endif

#ifdef CONFIG_KASAN_GENERIC
struct kasan_cache kasan_info;
#endif

#ifdef CONFIG_HARDENED_USERCOPY
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */
#endif

struct kmem_cache_node *node[MAX_NUMNODES];
};
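The packing described in the comment on oo above can be illustrated with a short sketch. The kernel's own helpers are oo_order()/oo_objects(); the 16-bit split and the single unsigned int member x follow that comment and my reading of slub.c, and the _sketch names are made up for illustration.

/* Illustration of the oo encoding: high 16 bits = page order of a slab,
 * low 16 bits = number of objects that fit in a slab of that order. */
#define OO_SHIFT_SKETCH 16
#define OO_MASK_SKETCH  ((1U << OO_SHIFT_SKETCH) - 1)

static inline unsigned int oo_order_sketch(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT_SKETCH;	/* order passed to the buddy system */
}

static inline unsigned int oo_objects_sketch(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK_SKETCH;	/* objects per slab at that order */
}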

kmem_cache_cpu

Each kmem_cache has one kmem_cache_cpu structure per CPU, used to handle allocation and free requests quickly.
Three key fields:
freelist: lockless freelist for fast allocation.
slab: the slab the current CPU is allocating from.
partial: slabs that are not currently in use and are partially free.

struct kmem_cache_cpu {
union {
struct {
void **freelist; /* Pointer to next available object */
unsigned long tid; /* Globally unique transaction id */
};
freelist_aba_t freelist_tid;
};
struct slab *slab; /* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct slab *partial; /* Partially allocated frozen slabs */
#endif
local_lock_t lock; /* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};

kmem_cache_node

For SLUB, the only thing to pay attention to here is the partial list. It can be seen as an intermediate layer between kmem_cache_cpu and the buddy system.

/*
* The slab lists for all objects.
*/
struct kmem_cache_node {
#ifdef CONFIG_SLAB
raw_spinlock_t list_lock;
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long total_slabs; /* length of all slab lists */
unsigned long free_slabs; /* length of free slab list only */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
struct array_cache *shared; /* shared per node */
struct alien_cache **alien; /* on other nodes */
unsigned long next_reap; /* updated without locking */
int free_touched; /* updated without locking */
#endif

#ifdef CONFIG_SLUB
spinlock_t list_lock;
unsigned long nr_partial;
struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
atomic_long_t nr_slabs;
atomic_long_t total_objects;
struct list_head full;
#endif
#endif

};

slab

The contiguous pages obtained from the buddy system in a single request are called a slab.
In the Linux kernel code, struct slab, struct folio and struct page are essentially the same thing (struct slab used to be embedded directly in struct page as a union). Since a slab may consist of several contiguous pages, the structure is stored only in the first page.
As a result, a virtual address can be converted directly to its corresponding slab.

The fields worth paying attention to:
slab_cache: points to the owning kmem_cache, which is what makes the virtual-address-to-kmem_cache conversion work.
slab_list: links the slab into the doubly linked lists of kmem_cache_node.
next: links the slab into the singly linked partial list of kmem_cache_cpu.
slabs: the number of slabs after the current one, used to count the kmem_cache_cpu partial list.
freelist: the list of free objects in this slab.
inuse: the number of objects currently in use in this slab.
objects: the number of objects this slab contains.
frozen: whether this slab is frozen (frozen meaning it has been handed to a specific CPU and sits in kmem_cache_cpu->slab or ->partial).
rcu_head: used for RCU callbacks.

inuse, objects and frozen are overlaid as a whole by counters (a union), so they can be updated in one go.

/* Reuses the bits in struct page */
struct slab {
unsigned long __page_flags;

#if defined(CONFIG_SLAB)

struct kmem_cache *slab_cache;
union {
struct {
struct list_head slab_list;
void *freelist; /* array of free object indexes */
void *s_mem; /* first object */
};
struct rcu_head rcu_head;
};
unsigned int active;

#elif defined(CONFIG_SLUB)

struct kmem_cache *slab_cache;
union {
struct {
union {
struct list_head slab_list;
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct {
struct slab *next;
int slabs; /* Nr of slabs left */
};
#endif
};
/* Double-word boundary */
union {
struct {
void *freelist; /* first free object */
union {
unsigned long counters;
struct {
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
};
};
#ifdef system_has_freelist_aba
freelist_aba_t freelist_counter;
#endif
};
};
struct rcu_head rcu_head;
};
unsigned int __unused;

#else
#error "Unexpected slab allocator configured"
#endif

atomic_t __page_refcount;
#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#endif
};
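To make the address-to-slab conversion above concrete, here is a minimal sketch. It assumes the virt_to_slab() helper from mm/slab.h (address -> page/folio -> struct slab); the function name cache_of_object is made up for illustration.

/* Sketch: recover the kmem_cache that owns an arbitrary slab object.
 * This works because struct slab overlays the first struct page of the
 * slab and carries a back-pointer to its cache. */
static struct kmem_cache *cache_of_object(const void *object)
{
	struct slab *slab = virt_to_slab(object);	/* NULL if not a slab page */

	if (!slab)
		return NULL;
	return slab->slab_cache;			/* set when the slab was allocated */
}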

Flow Analysis

slab_alloc_node

Every kernel interface that allocates memory through the slab allocator eventually calls slab_alloc_node to do the actual allocation.
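For context, here is a small, hypothetical caller-side example (struct foo and the cache name are made up); both kmem_cache_alloc() and kmalloc() below end up in slab_alloc_node():

#include <linux/slab.h>

struct foo {
	int id;
	char name[32];
};

static struct kmem_cache *foo_cachep;

static int demo(void)
{
	struct foo *obj;
	void *buf;

	/* dedicated cache: one kmem_cache whose objects are sizeof(struct foo) */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cachep)
		return -ENOMEM;

	obj = kmem_cache_alloc(foo_cachep, GFP_KERNEL);	/* -> slab_alloc_node() */
	if (obj)
		kmem_cache_free(foo_cachep, obj);

	buf = kmalloc(64, GFP_KERNEL);			/* kmalloc-64 cache, same path */
	kfree(buf);					/* kfree(NULL) is a no-op */

	kmem_cache_destroy(foo_cachep);
	return 0;
}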

/*
* If the lockless freelist can be used, allocate via the FASTPATH;
* otherwise take the SLOWPATH via __slab_alloc.
*/
static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
void *object;
struct obj_cgroup *objcg = NULL;
bool init = false;

s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
if (!s)
return NULL;

object = kfence_alloc(s, orig_size, gfpflags);
if (unlikely(object))
goto out;

object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);

maybe_wipe_obj_freeptr(s, object);
init = slab_want_init_on_alloc(gfpflags, s);

out:
/*
* When init equals 'true', like for kzalloc() family, only
* @orig_size bytes might be zeroed instead of s->object_size
*/
slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);

return object;
}

The actual handling is in __slab_alloc_node. The FASTPATH simply takes the first object off kmem_cache_cpu->freelist and returns it; of course, entering the FASTPATH requires that both c->freelist and c->slab are non-NULL.

static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
struct kmem_cache_cpu *c;
struct slab *slab;
unsigned long tid;
void *object;

redo:
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
* enabled. We may switch back and forth between cpus while
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
*
* We must guarantee that tid and kmem_cache_cpu are retrieved on the
* same cpu. We read first the kmem_cache_cpu pointer and use it to read
* the tid. If we are preempted and switched to another cpu between the
* two reads, it's OK as the two are still associated with the same cpu
* and cmpxchg later will validate the cpu.
*/
c = raw_cpu_ptr(s->cpu_slab);
tid = READ_ONCE(c->tid);

/*
* Irqless object alloc/free algorithm used here depends on sequence
* of fetching cpu_slab's data. tid should be fetched before anything
* on c to guarantee that object and slab associated with previous tid
* won't be used with current tid. If we fetch tid first, object and
* slab could be one associated with next tid and our alloc/free
* request will be failed. In this case, we will retry. So, no problem.
*/
barrier();

/*
* The transaction ids are globally unique per cpu and per operation on
* a per cpu queue. Thus they can be guarantee that the cmpxchg_double
* occurs on the right processor and that there was no operation on the
* linked list in between.
*/

object = c->freelist;
slab = c->slab;

if (!USE_LOCKLESS_FAST_PATH() ||
unlikely(!object || !slab || !node_match(slab, node))) {
//SLOWPATH
object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
} else {
//FASTPATH: take the first object straight off kmem_cache_cpu->freelist
void *next_object = get_freepointer_safe(s, object);

/*
* The cmpxchg will only match if there was no additional
* operation and if we are on the right processor.
*
* The cmpxchg does the following atomically (without lock
* semantics!)
* 1. Relocate first pointer to the current per cpu area.
* 2. Verify that tid and freelist have not been changed
* 3. If they were not changed replace tid and freelist
*
* Since this is without lock semantics the protection is only
* against code executing on this cpu *not* from access by
* other cpus.
*/
if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
prefetch_freepointer(s, next_object);
stat(s, ALLOC_FASTPATH);
}

return object;
}

___slab_alloc contains the main SLOWPATH logic. Since it is fairly complex and already organized into blocks, it is analyzed block by block below.

/*
* Slow path. The lockless freelist is empty or we need to perform
* debugging duties.
*
* If new objects have already been freed to the regular freelist,
* processing is still very fast: we simply take over the regular
* freelist as the lockless freelist and zap the regular freelist.
*
* If that does not work, we fall back to the partial lists. We take the
* first object of a partial slab's freelist as the object to return for
* this allocation, then move the rest of that freelist to the lockless
* freelist.
*
* If we cannot get a new slab from the partial slab lists, we need to
* allocate a new slab. This is the slowest path since it involves a call
* to the page allocator and the setup of a new slab.
*
* This version of __slab_alloc is used when we know that preemption has
* already been disabled.
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *freelist;
struct slab *slab;
unsigned long flags;
struct partial_context pc;

stat(s, ALLOC_SLOWPATH);

reread_slab

//read c->slab into a local variable for easier access and modification
reread_slab:

slab = READ_ONCE(c->slab);
if (!slab) {
/*
* if the node is not online or has no normal memory, just
* ignore the node constraint
*/
//this kmem_cache_cpu has no slab in use; jump to new_slab
if (unlikely(node != NUMA_NO_NODE &&
!node_isset(node, slab_nodes)))
node = NUMA_NO_NODE;
goto new_slab;
}

redo

This block mainly performs a series of checks that dispatch the handling to the different blocks below.

redo:

if (unlikely(!node_match(slab, node))) {
/*
* same as above but node_match() being false already
* implies node != NUMA_NO_NODE
*/
if (!node_isset(node, slab_nodes)) {
node = NUMA_NO_NODE;
} else {
stat(s, ALLOC_NODE_MISMATCH);
goto deactivate_slab;
}
}

/*
* By rights, we should be searching for a slab page that was
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
if (unlikely(!pfmemalloc_match(slab, gfpflags)))
goto deactivate_slab;

/* must check again c->slab in case we got preempted and it changed */
local_lock_irqsave(&s->cpu_slab->lock, flags);
if (unlikely(slab != c->slab)) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
goto reread_slab;
}
freelist = c->freelist;
if (freelist)
goto load_freelist;

//the kmem_cache_cpu freelist is empty; try to take a freelist from kmem_cache_cpu->slab
freelist = get_freelist(s, slab);

//still nothing, which means the slab currently in use is full; try to get a new slab
if (!freelist) {
c->slab = NULL;
c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}

stat(s, ALLOC_REFILL);
/*
* Check the slab's freelist and decide whether to transfer it to the
* per-cpu freelist or to deactivate the slab.
*
* If the return value is not NULL the slab is still frozen;
* if it is NULL the slab has been unfrozen.
*/
static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
{
struct slab new;
unsigned long counters;
void *freelist;

lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

do {
freelist = slab->freelist;
counters = slab->counters;

new.counters = counters;
VM_BUG_ON(!new.frozen);

new.inuse = slab->objects;
new.frozen = freelist != NULL;

} while (!__slab_update_freelist(s, slab,
freelist, counters,
NULL, new.counters,
"get_freelist"));

return freelist;
}

load_freelist

This block handles the logic once a freelist has been obtained through any of the paths above.

/*
* Load the freelist: update kmem_cache_cpu->freelist to the second
* object on the free list and return the first one (this ___slab_alloc
* call hands out one object directly).
*/
load_freelist:

lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

/*
* freelist is pointing to the list of objects to be used.
* slab is pointing to the slab from which the objects are obtained.
* That slab must be frozen for per cpu allocations to work.
*/
VM_BUG_ON(!c->slab->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
return freelist;

// Get the address of the next object after @object on the freelist.
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
unsigned long ptr_addr;
freeptr_t p;

object = kasan_reset_tag(object);
//the free pointer is stored at offset kmem_cache->offset inside the object
ptr_addr = (unsigned long)object + s->offset;
p = *(freeptr_t *)(ptr_addr);
return freelist_ptr_decode(s, p, ptr_addr);
}
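freelist_ptr_decode() is where CONFIG_SLAB_FREELIST_HARDENED (the option lamented in the title) bites: with it enabled, what is stored in the object is not the raw next pointer but the pointer XORed with s->random and with the byte-swapped address of the slot holding it. The sketch below paraphrases that scheme; the field and helper names follow my reading of linux-6.6 and may differ in other versions.

/* Paraphrased sketch of free-pointer decoding under
 * CONFIG_SLAB_FREELIST_HARDENED:
 *   stored = real ^ s->random ^ swab(address of the freelist slot)
 * so recovering or forging a freelist pointer requires knowing both
 * s->random and the slot address, which is what breaks naive freelist
 * overwrites. */
static inline void *freelist_ptr_decode_sketch(const struct kmem_cache *s,
					       freeptr_t p, unsigned long ptr_addr)
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	return (void *)(p.v ^ s->random ^ swab(ptr_addr));
#else
	return (void *)p.v;
#endif
}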

deactivate_slab

/*
* Splice kmem_cache_cpu->slab->freelist onto the end of
* kmem_cache_cpu->freelist, unfreeze the slab and, depending on the
* state of the merged slab, put it on the corresponding
* kmem_cache_node's partial list, discard it, or leave it off any list.
*/
deactivate_slab:
local_lock_irqsave(&s->cpu_slab->lock, flags);
//guard against a concurrent change of c->slab
if (slab != c->slab) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
goto reread_slab;
}
freelist = c->freelist;
c->slab = NULL;
c->freelist = NULL;
c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
deactivate_slab(s, slab, freelist);
/*
* Finish removing the cpu slab. Merge the cpu freelist with the slab's
* freelist, unfreeze the slab and put it on the proper list.
* Assumes the slab has already been removed safely from the
* kmem_cache_cpu by the caller.
*/
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
void *freelist)
{
enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST };
struct kmem_cache_node *n = get_node(s, slab_nid(slab));
int free_delta = 0;
enum slab_modes mode = M_NONE;
void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
unsigned long flags = 0;
struct slab new;
struct slab old;

if (slab->freelist) {
stat(s, DEACTIVATE_REMOTE_FREES);
tail = DEACTIVATE_TO_TAIL;
}

/*
* Stage one: Count the objects on cpu's freelist as free_delta and
* remember the last object in freelist_tail for later splicing.
*/
freelist_tail = NULL;
freelist_iter = freelist;
while (freelist_iter) {
nextfree = get_freepointer(s, freelist_iter);

/*
* If 'nextfree' is invalid, it is possible that the object at
* 'freelist_iter' is already corrupted. So isolate all objects
* starting at 'freelist_iter' by skipping them.
*/
if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
break;

freelist_tail = freelist_iter;
free_delta++;

freelist_iter = nextfree;
}

/*
* Stage two: Unfreeze the slab while splicing the per-cpu
* freelist to the head of slab's freelist.
*
* Ensure that the slab is unfrozen while the list presence
* reflects the actual number of objects during unfreeze.
*
* We first perform cmpxchg holding lock and insert to list
* when it succeed. If there is mismatch then the slab is not
* unfrozen and number of objects in the slab may have changed.
* Then release lock and retry cmpxchg again.
*/
redo:

old.freelist = READ_ONCE(slab->freelist);
old.counters = READ_ONCE(slab->counters);
VM_BUG_ON(!old.frozen);

/* Determine target state of the slab */
new.counters = old.counters;
if (freelist_tail) {
new.inuse -= free_delta;
set_freepointer(s, freelist_tail, old.freelist);
new.freelist = freelist;
} else
new.freelist = old.freelist;

new.frozen = 0;

if (!new.inuse && n->nr_partial >= s->min_partial) {
mode = M_FREE;
} else if (new.freelist) {
mode = M_PARTIAL;
/*
* Taking the spinlock removes the possibility that
* acquire_slab() will see a slab that is frozen
*/
spin_lock_irqsave(&n->list_lock, flags);
} else {
mode = M_FULL_NOLIST;
}


if (!slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab")) {
if (mode == M_PARTIAL)
spin_unlock_irqrestore(&n->list_lock, flags);
goto redo;
}


if (mode == M_PARTIAL) {
add_partial(n, slab, tail);
spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, tail);
} else if (mode == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
discard_slab(s, slab);
stat(s, FREE_SLAB);
} else if (mode == M_FULL_NOLIST) {
stat(s, DEACTIVATE_FULL);
}
}

new_slab

/*
* Try to take a slab from kmem_cache_cpu->partial and install it as
* kmem_cache_cpu->slab.
*/
new_slab:

if (slub_percpu_partial(c)) {
local_lock_irqsave(&s->cpu_slab->lock, flags);
if (unlikely(c->slab)) {
//guard against concurrency: c->slab may have been set meanwhile
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
goto reread_slab;
}
if (unlikely(!slub_percpu_partial(c))) {
//guard against concurrency: the partial list may have emptied meanwhile
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
/* we were preempted and partial list got empty */
goto new_objects;
}

slab = c->slab = slub_percpu_partial(c);
slub_set_percpu_partial(c, slab); //advance the per-cpu partial list to the next slab
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, CPU_PARTIAL_ALLOC);
goto redo;
}

#define slub_percpu_partial(c) ((c)->partial)

new_objects

If taking a slab from kmem_cache_cpu->partial fails, we fall back to the kmem_cache_node.
First we try to take a slab from kmem_cache_node->partial; if that fails, the new_slab function (note: not the same thing as the new_slab label above) is called to request a fresh slab from the buddy system.
A slab obtained either way is set to the full state and frozen, ready to be installed as kmem_cache_cpu->slab later.

/*
* Try to get a slab from kmem_cache_node->partial.
* If that fails, call new_slab to request a new slab from the buddy system.
*/
new_objects:

pc.flags = gfpflags;
pc.slab = &slab;
pc.orig_size = orig_size;
freelist = get_partial(s, node, &pc);
if (freelist)
goto check_new_slab;

slub_put_cpu_ptr(s->cpu_slab);
slab = new_slab(s, gfpflags, node);
c = slub_get_cpu_ptr(s->cpu_slab);

if (unlikely(!slab)) {
slab_out_of_memory(s, gfpflags, node);
return NULL;
}

stat(s, ALLOC_SLAB);

if (kmem_cache_debug(s)) {
freelist = alloc_single_from_new_slab(s, slab, orig_size);

if (unlikely(!freelist))
goto new_objects;

if (s->flags & SLAB_STORE_USER)
set_track(s, freelist, TRACK_ALLOC, addr);

return freelist;
}

/*
* No other reference to the slab yet so we can
* muck around with it freely without cmpxchg
*/
freelist = slab->freelist;
slab->freelist = NULL;
slab->inuse = slab->objects;
slab->frozen = 1;

//update kmem_cache_node->nr_slabs and total_objects
inc_slabs_node(s, slab_nid(slab), slab->objects);

get_partial first tries get_partial_node to allocate a slab from the current kmem_cache_node; if that fails and node was given as NUMA_NO_NODE, get_any_partial searches the other kmem_cache_nodes for one.


/*
* Get a partial slab, lock it and return it.
*/
static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
void *object;
int searchnode = node;

if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();

object = get_partial_node(s, get_node(s, searchnode), pc);
if (object || node != NUMA_NO_NODE)
return object;

return get_any_partial(s, pc);
}
/*
* Walk kmem_cache_node->partial and try to take slabs from it.
* Try to allocate a partial slab from a specific node.
*/
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
struct partial_context *pc)
{
struct slab *slab, *slab2;
void *object = NULL;
unsigned long flags;
unsigned int partial_slabs = 0;

/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
* partial slab and there is none available then get_partial()
* will return NULL.
*/
if (!n || !n->nr_partial)
return NULL;

spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
void *t;

if (!pfmemalloc_match(slab, pc->flags))
continue;

if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
object = alloc_single_from_partial(s, n, slab,
pc->orig_size);
if (object)
break;
continue;
}

t = acquire_slab(s, n, slab, object == NULL);
if (!t)
break;

if (!object) {
*pc->slab = slab;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
} else {
put_cpu_partial(s, slab, 0);
stat(s, CPU_PARTIAL_NODE);
partial_slabs++;
}
#ifdef CONFIG_SLUB_CPU_PARTIAL
if (!kmem_cache_has_cpu_partial(s)
|| partial_slabs > s->cpu_partial_slabs / 2)
break;
#else
break;
#endif

}
spin_unlock_irqrestore(&n->list_lock, flags);
return object;
}
/*
* Remove the slab from kmem_cache_node->partial.
* Set it to the full state, freeze it, and return slab->freelist.
*/
static inline void *acquire_slab(struct kmem_cache *s,
struct kmem_cache_node *n, struct slab *slab,
int mode)
{
void *freelist;
unsigned long counters;
struct slab new;

lockdep_assert_held(&n->list_lock);

/*
* Zap the freelist and set the frozen bit.
* The old freelist is the list of objects for the
* per cpu allocation list.
*/
freelist = slab->freelist;
counters = slab->counters;
new.counters = counters;
if (mode) {
new.inuse = slab->objects;
new.freelist = NULL;
} else {
new.freelist = freelist;
}

VM_BUG_ON(new.frozen);
new.frozen = 1;

if (!__slab_update_freelist(s, slab,
freelist, counters,
new.freelist, new.counters,
"acquire_slab"))
return NULL;

remove_partial(n, slab);
WARN_ON(!freelist);
return freelist;
}
static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);

WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));

return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}

The core slab allocation happens in allocate_slab:
request contiguous pages from the underlying buddy system to serve as the slab, then initialize the slab and slab->freelist according to the request.

static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct slab *slab;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
void *start, *p, *next;
int idx;
bool shuffle;

flags &= gfp_allowed_mask;

flags |= s->allocflags;

/*
* Let the initial higher-order allocation fail under memory pressure
* so we fall-back to the minimum order allocation.
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;

slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab)) {
oo = s->min;
alloc_gfp = flags;
/*
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab))
return NULL;
stat(s, ORDER_FALLBACK);
}

slab->objects = oo_objects(oo);
slab->inuse = 0;
slab->frozen = 0;

account_slab(slab, oo_order(oo), s, flags);

slab->slab_cache = s;

kasan_poison_slab(slab);

start = slab_address(slab);

setup_slab_debug(s, slab, start);

//CONFIG_SLAB_FREELIST_RANDOM
shuffle = shuffle_freelist(s, slab);
//if it is not enabled, chain the objects from low addresses to high addresses
if (!shuffle) {
start = fixup_red_left(s, start);
start = setup_object(s, start);
slab->freelist = start;
for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
next = p + s->size;
next = setup_object(s, next);
set_freepointer(s, p, next);
p = next;
}
set_freepointer(s, p, NULL);
}

return slab;
}

/*
* Slab allocation and freeing
*/
static inline struct slab *alloc_slab_page(gfp_t flags, int node,
struct kmem_cache_order_objects oo)
{
struct folio *folio;
struct slab *slab;
unsigned int order = oo_order(oo);

if (node == NUMA_NO_NODE)
folio = (struct folio *)alloc_pages(flags, order);
else
folio = (struct folio *)__alloc_pages_node(node, flags, order);

if (!folio)
return NULL;

slab = folio_slab(folio);
__folio_set_slab(folio);
/* Make the flag visible before any changes to folio->mapping */
smp_wmb();
if (folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);

return slab;
}

When CONFIG_SLAB_FREELIST_RANDOM is enabled, shuffle_freelist randomizes the order of the freelist.

/* Shuffle the single linked freelist based on a random pre-computed sequence */
static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
{
void *start;
void *cur;
void *next;
unsigned long idx, pos, page_limit, freelist_count;

if (slab->objects < 2 || !s->random_seq)
return false;

freelist_count = oo_objects(s->oo);
pos = get_random_u32_below(freelist_count);

page_limit = slab->objects * s->size;
start = fixup_red_left(s, slab_address(slab));

/* First entry is used as the base of the freelist */
cur = next_freelist_entry(s, slab, &pos, start, page_limit,
freelist_count);
cur = setup_object(s, cur);
slab->freelist = cur;

for (idx = 1; idx < slab->objects; idx++) {
next = next_freelist_entry(s, slab, &pos, start, page_limit,
freelist_count);
next = setup_object(s, next);
set_freepointer(s, cur, next);
cur = next;
}
set_freepointer(s, cur, NULL);

return true;
}

check_new_slab

A few checks: verify that the slab matches the allocation flags; if not, deactivate it.

check_new_slab:

if (kmem_cache_debug(s)) {
/*
* For debug caches here we had to go through
* alloc_single_from_partial() so just store the tracking info
* and return the object
*/
if (s->flags & SLAB_STORE_USER)
set_track(s, freelist, TRACK_ALLOC, addr);

return freelist;
}

if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
/*
* For !pfmemalloc_match() case we don't load freelist so that
* we don't make further mismatched allocations easier.
*/
deactivate_slab(s, slab, get_freepointer(s, freelist));
return freelist;
}

retry_load_slab

/*
* Try again to install the slab. If kmem_cache_cpu already has a slab
* (a concurrency issue), deactivate it and retry until the new slab is
* installed successfully.
*/
retry_load_slab:

local_lock_irqsave(&s->cpu_slab->lock, flags);
if (unlikely(c->slab)) {
void *flush_freelist = c->freelist;
struct slab *flush_slab = c->slab;

c->slab = NULL;
c->freelist = NULL;
c->tid = next_tid(c->tid);

local_unlock_irqrestore(&s->cpu_slab->lock, flags);

deactivate_slab(s, flush_slab, flush_freelist);

stat(s, CPUSLAB_FLUSH);

goto retry_load_slab;
}
c->slab = slab;

goto load_freelist;
}

Full Recap

static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *freelist;
struct slab *slab;
unsigned long flags;
struct partial_context pc;

stat(s, ALLOC_SLOWPATH);

//read c->slab into a local variable for easier access and modification
reread_slab:

slab = READ_ONCE(c->slab);
if (!slab) {
/*
* if the node is not online or has no normal memory, just
* ignore the node constraint
*/
//this kmem_cache_cpu has no slab in use; jump to new_slab
if (unlikely(node != NUMA_NO_NODE &&
!node_isset(node, slab_nodes)))
node = NUMA_NO_NODE;
goto new_slab;
}
redo:

if (unlikely(!node_match(slab, node))) {
/*
* same as above but node_match() being false already
* implies node != NUMA_NO_NODE
*/
if (!node_isset(node, slab_nodes)) {
node = NUMA_NO_NODE;
} else {
stat(s, ALLOC_NODE_MISMATCH);
goto deactivate_slab;
}
}

/*
* By rights, we should be searching for a slab page that was
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
if (unlikely(!pfmemalloc_match(slab, gfpflags)))
goto deactivate_slab;

/* must check again c->slab in case we got preempted and it changed */
local_lock_irqsave(&s->cpu_slab->lock, flags);
if (unlikely(slab != c->slab)) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
goto reread_slab;
}
freelist = c->freelist;
if (freelist)
goto load_freelist;

//the kmem_cache_cpu freelist is empty; try to take a freelist from kmem_cache_cpu->slab
freelist = get_freelist(s, slab);

//still nothing, which means the slab currently in use is full; try to get a new slab
if (!freelist) {
c->slab = NULL;
c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}

stat(s, ALLOC_REFILL);

/*
* Load the freelist: update kmem_cache_cpu->freelist to the second
* object on the free list and return the first one (this ___slab_alloc
* call hands out one object directly).
*/
load_freelist:

lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

/*
* freelist is pointing to the list of objects to be used.
* slab is pointing to the slab from which the objects are obtained.
* That slab must be frozen for per cpu allocations to work.
*/
VM_BUG_ON(!c->slab->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
return freelist;


/*
* Splice kmem_cache_cpu->slab->freelist onto the end of
* kmem_cache_cpu->freelist, unfreeze the slab and, depending on the
* state of the merged slab, put it on the corresponding
* kmem_cache_node's partial list, discard it, or leave it off any list.
*/
deactivate_slab:
local_lock_irqsave(&s->cpu_slab->lock, flags);
if (slab != c->slab) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
goto reread_slab;
}
freelist = c->freelist;
c->slab = NULL;
c->freelist = NULL;
c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
deactivate_slab(s, slab, freelist);

/*
* Try to take a slab from kmem_cache_cpu->partial.
*/
new_slab:

if (slub_percpu_partial(c)) {
local_lock_irqsave(&s->cpu_slab->lock, flags);
if (unlikely(c->slab)) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
goto reread_slab;
}
if (unlikely(!slub_percpu_partial(c))) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
/* we were preempted and partial list got empty */
goto new_objects;
}

slab = c->slab = slub_percpu_partial(c);
slub_set_percpu_partial(c, slab); //advance the per-cpu partial list to the next slab
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, CPU_PARTIAL_ALLOC);
goto redo;
}

/*
* Try to get a slab from kmem_cache_node->partial.
* If that fails, call new_slab to request a new slab from the buddy system.
*/
new_objects:

pc.flags = gfpflags;
pc.slab = &slab;
pc.orig_size = orig_size;
freelist = get_partial(s, node, &pc);
if (freelist)
goto check_new_slab;

slub_put_cpu_ptr(s->cpu_slab);
slab = new_slab(s, gfpflags, node);
c = slub_get_cpu_ptr(s->cpu_slab);

if (unlikely(!slab)) {
slab_out_of_memory(s, gfpflags, node);
return NULL;
}

stat(s, ALLOC_SLAB);

if (kmem_cache_debug(s)) {
freelist = alloc_single_from_new_slab(s, slab, orig_size);

if (unlikely(!freelist))
goto new_objects;

if (s->flags & SLAB_STORE_USER)
set_track(s, freelist, TRACK_ALLOC, addr);

return freelist;
}

/*
* No other reference to the slab yet so we can
* muck around with it freely without cmpxchg
*/
freelist = slab->freelist;
slab->freelist = NULL;
slab->inuse = slab->objects;
slab->frozen = 1;

//update kmem_cache_node->nr_slabs and total_objects
inc_slabs_node(s, slab_nid(slab), slab->objects);

check_new_slab:

if (kmem_cache_debug(s)) {
/*
* For debug caches here we had to go through
* alloc_single_from_partial() so just store the tracking info
* and return the object
*/
if (s->flags & SLAB_STORE_USER)
set_track(s, freelist, TRACK_ALLOC, addr);

return freelist;
}

if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
/*
* For !pfmemalloc_match() case we don't load freelist so that
* we don't make further mismatched allocations easier.
*/
deactivate_slab(s, slab, get_freepointer(s, freelist));
return freelist;
}

/*
* Try again to install the slab. If kmem_cache_cpu already has a slab
* (a concurrency issue), deactivate it and retry until the new slab is
* installed successfully.
*/
retry_load_slab:

local_lock_irqsave(&s->cpu_slab->lock, flags);
if (unlikely(c->slab)) {
void *flush_freelist = c->freelist;
struct slab *flush_slab = c->slab;

c->slab = NULL;
c->freelist = NULL;
c->tid = next_tid(c->tid);

local_unlock_irqrestore(&s->cpu_slab->lock, flags);

deactivate_slab(s, flush_slab, flush_freelist);

stat(s, CPUSLAB_FLUSH);

goto retry_load_slab;
}
c->slab = slab;

goto load_freelist;
}

slab_free

Freeing slab memory eventually ends up in slab_free.
The branching inside slab_free is a bit involved and does not diagram well, so just follow the analysis below.
In short:

  1. Put the object back onto the freelist of its slab.
  2. Put the slab onto kmem_cache_cpu->partial, or fall back to the kmem_cache_node and update its full and partial lists.


static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
void *head, void *tail, void **p, int cnt,
unsigned long addr)
{
memcg_slab_free_hook(s, slab, p, cnt);
/*
* With KASAN enabled slab_free_freelist_hook modifies the freelist
* to remove objects, whose reuse must be delayed.
*/
if (slab_free_freelist_hook(s, &head, &tail, &cnt))
do_slab_free(s, slab, head, tail, cnt, addr);
}

Just like allocation, freeing has a FASTPATH and a SLOWPATH.

/*
* The FASTPATH can only be used when we are freeing into
* kmem_cache_cpu->slab, which is usually the case right after the
* object was allocated.
*
* If the FASTPATH cannot be taken, fall back to __slab_free for the
* various special cases.
*
* By passing head and tail pointers plus an object count, several
* objects on one freelist (all belonging to the same slab) can be freed
* at once; setting the tail pointer signals such a bulk free.
*/
static __always_inline void do_slab_free(struct kmem_cache *s,
struct slab *slab, void *head, void *tail,
int cnt, unsigned long addr)
{
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
void **freelist;

redo:
/*
* Determine the currently cpus per cpu slab.
* The cpu may change afterward. However that does not matter since
* data is retrieved via this pointer. If we are on the same cpu
* during the cmpxchg then the free will succeed.
*/
c = raw_cpu_ptr(s->cpu_slab);
tid = READ_ONCE(c->tid);

/* Same with comment on barrier() in slab_alloc_node() */
barrier();

//are we freeing into the current cpu's slab? if not, take the SLOWPATH
if (unlikely(slab != c->slab)) {
__slab_free(s, slab, head, tail_obj, cnt, addr);
return;
}

//FASTPATH: hook the old freelist after tail and set head as the new
//freelist. Lockless and locked variants follow.
if (USE_LOCKLESS_FAST_PATH()) {
freelist = READ_ONCE(c->freelist);

set_freepointer(s, tail_obj, freelist);

if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
} else {
/* Update the free list under the local lock */
local_lock(&s->cpu_slab->lock);
c = this_cpu_ptr(s->cpu_slab);
if (unlikely(slab != c->slab)) {
local_unlock(&s->cpu_slab->lock);
goto redo;
}
tid = c->tid;
freelist = c->freelist;

set_freepointer(s, tail_obj, freelist);
c->freelist = head;
c->tid = next_tid(tid);

local_unlock(&s->cpu_slab->lock);
}
stat(s, FREE_FASTPATH);
}
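The head/tail batching mentioned in the comment above is what the bulk APIs rely on: as far as I can tell, kmem_cache_free_bulk() groups objects belonging to the same slab into one detached freelist and hands each group to do_slab_free() with a head, a tail and a count. A hypothetical caller-side example:

#include <linux/slab.h>

/* Hypothetical snippet: batch-allocate and batch-free 8 objects from one
 * cache; the free side exercises the head/tail path described above. */
static int bulk_demo(struct kmem_cache *cachep)
{
	void *objs[8];
	int n;

	n = kmem_cache_alloc_bulk(cachep, GFP_KERNEL, 8, objs);
	if (!n)
		return -ENOMEM;

	/* ... use the objects ... */

	kmem_cache_free_bulk(cachep, n, objs);
	return 0;
}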
/*
* SLOW PATH handling. This path is still taken quite often, since in
* most workloads objects live longer than the cpu slab they came from.
*
* So we still attempt to reduce cache line usage: just take the slab
* lock and free the object. If there is no additional partial slab
* handling required then we can return immediately.
*/

static void __slab_free(struct kmem_cache *s, struct slab *slab,
void *head, void *tail, int cnt,
unsigned long addr)

{
void *prior;
int was_frozen;
struct slab new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
unsigned long flags;

stat(s, FREE_SLOWPATH);

if (kfence_free(head))
return;

if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
free_to_partial_list(s, slab, head, tail, cnt, addr);
return;
}

do {
if (unlikely(n)) {
spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
prior = slab->freelist;
counters = slab->counters;
set_freepointer(s, tail, prior);
new.counters = counters;
was_frozen = new.frozen;
new.inuse -= cnt;
if ((!new.inuse || !prior) && !was_frozen) {

if (kmem_cache_has_cpu_partial(s) && !prior) {
/*
* Slab was on no list before and will be
* partially empty
* We can defer the list move and instead
* freeze it.
*/
//the slab was on no list (full) and unfrozen, and is about to become
//partially free, so we can defer the list move and simply freeze it
//instead
new.frozen = 1;

} else { /* Needs to be taken off a list */
/*
* We need to fall back to the kmem_cache_node. The conditions are:
* 1. the slab was not frozen before this free (!was_frozen), and
* 2. (the slab becomes completely free) ||
*    !(the slab was full before this free && CONFIG_SLUB_CPU_PARTIAL)
*/
n = get_node(s, slab_nid(slab));
/*
* Speculatively acquire the list_lock.
* If the cmpxchg does not succeed then we may
* drop the list_lock without any processing.
*
* Otherwise the list_lock will synchronize with
* other processors updating the list of slabs.
*/
spin_lock_irqsave(&n->list_lock, flags);

}
}

} while (!slab_update_freelist(s, slab,
prior, counters,
head, new.counters,
"__slab_free"));
//the freelist update is complete; since the slab's state may have changed,
//what follows moves the slab to the right place.

if (likely(!n)) {
//no need to fall back to the kmem_cache_node
if (likely(was_frozen)) {
//the slab was frozen before the free; nothing to do
/*
* The list lock was not taken therefore no list
* activity can be necessary.
*/
stat(s, FREE_FROZEN);
} else if (new.frozen) {
//the slab was just frozen by this free; put it on the current kmem_cache_cpu->partial
/*
* If we just froze the slab then put it onto the
* per cpu partial list.
*/
put_cpu_partial(s, slab, 1);
stat(s, CPU_PARTIAL_FREE);
}

return;
}

if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
goto slab_empty;

/*
* Objects left in the slab. If it was not on the partial list before
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
//the slab was full and is not completely free after this free: move it from full to partial
remove_full(s, n, slab);
add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
return;

slab_empty:
//the slab is completely free and n->nr_partial is at or above min_partial: remove it from its list, then destroy it
if (prior) {
/*
* Slab on the partial list.
*/
remove_partial(n, slab);
stat(s, FREE_REMOVE_PARTIAL);
} else {
/* Slab must be on the full list */
remove_full(s, n, slab);
}

spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
discard_slab(s, slab);
}
/*
* Move the just-frozen slab onto kmem_cache_cpu->partial.
* If that list is already full, unfreeze the old partial list (oldslab)
* and move it to the kmem_cache_node or destroy it.
*/
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
{
struct slab *oldslab;
struct slab *slab_to_unfreeze = NULL;
unsigned long flags;
int slabs = 0;

local_lock_irqsave(&s->cpu_slab->lock, flags);

oldslab = this_cpu_read(s->cpu_slab->partial);

if (oldslab) {
if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
/*
* Partial array is full. Move the existing set to the
* per node partial list. Postpone the actual unfreezing
* outside of the critical section.
*/
//if the per-cpu partial list already holds cpu_partial_slabs or more
//slabs, replace oldslab with slab, then unfreeze oldslab and move it
//to kmem_cache_node->partial. Note that slab_to_unfreeze here is a
//whole list
slab_to_unfreeze = oldslab;
oldslab = NULL;
} else {
slabs = oldslab->slabs;
}
}

slabs++;

//insert slab in front of oldslab (oldslab may be NULL here)
slab->slabs = slabs;
slab->next = oldslab;

//make slab the new head of kmem_cache_cpu->partial
this_cpu_write(s->cpu_slab->partial, slab);

local_unlock_irqrestore(&s->cpu_slab->lock, flags);


if (slab_to_unfreeze) {
//unfreeze slab_to_unfreeze and move it to
//kmem_cache_node->partial or destroy it.
//Note that slab_to_unfreeze is a whole list
__unfreeze_partials(s, slab_to_unfreeze);
stat(s, CPU_PARTIAL_DRAIN);
}
}

//unfreeze partial_slab and move it to
//kmem_cache_node->partial or destroy it.
//Note that partial_slab is a whole list
static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
{
struct kmem_cache_node *n = NULL, *n2 = NULL;
struct slab *slab, *slab_to_discard = NULL;
unsigned long flags = 0;

while (partial_slab) {
struct slab new;
struct slab old;

slab = partial_slab;
partial_slab = slab->next;

n2 = get_node(s, slab_nid(slab));
if (n != n2) {
if (n)
spin_unlock_irqrestore(&n->list_lock, flags);

n = n2;
spin_lock_irqsave(&n->list_lock, flags);
}

do {

old.freelist = slab->freelist;
old.counters = slab->counters;
VM_BUG_ON(!old.frozen);

new.counters = old.counters;
new.freelist = old.freelist;

new.frozen = 0;

} while (!__slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"));

if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
//if this slab is completely free and the node's nr_partial is already
//at or above min_partial, destroy the slab
slab->next = slab_to_discard;
slab_to_discard = slab;
} else {
//otherwise add it to kmem_cache_node->partial.
//Note the DEACTIVATE_TO_TAIL flag: it is inserted at the tail
add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
}

if (n)
spin_unlock_irqrestore(&n->list_lock, flags);

while (slab_to_discard) {
slab = slab_to_discard;
slab_to_discard = slab_to_discard->next;

stat(s, DEACTIVATE_EMPTY);
discard_slab(s, slab);
stat(s, FREE_SLAB);
}
}

This just returns the pages to the underlying buddy system; there is not much to analyze.

static void discard_slab(struct kmem_cache *s, struct slab *slab)
{
dec_slabs_node(s, slab_nid(slab), slab->objects);
free_slab(s, slab);
}