
Linux memory management: kmalloc

http://blog.chinaunix.net/uid-20786208-id-4783115.html
This post covers only physical memory management in the Linux kernel. I have read a lot of material explaining memory management, but whenever I tried to summarize it myself I never knew where to start, so let's begin with the actual physical memory allocation interfaces.

kmalloc allocates physically contiguous memory, and it does not zero the memory it returns. How much can it allocate, and from where? By default it is served from ZONE_NORMAL, i.e. low memory, rather than from highmem (with GFP_DMA it can also be served from the DMA zone, as the code below shows). In general physical memory is divided into three zones: DMA, NORMAL and HIGHMEM.
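As a quick orientation before diving into the implementation, here is a minimal usage sketch (my own illustration, not from the original post or the kernel source; the module and variable names are made up):

/* Minimal sketch: allocating and freeing a buffer with kmalloc in a module. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>

static char *buf;

static int __init kmalloc_demo_init(void)
{
    buf = kmalloc(128, GFP_KERNEL);     /* contents are NOT zeroed */
    if (!buf)
        return -ENOMEM;
    /* kzalloc(128, GFP_KERNEL) would return zero-filled memory instead */
    return 0;
}

static void __exit kmalloc_demo_exit(void)
{
    kfree(buf);
}

module_init(kmalloc_demo_init);
module_exit(kmalloc_demo_exit);
MODULE_LICENSE("GPL");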

kmalloc is built on top of the slab allocator: it allocates from a kmem_cache, and the cache in turn hands out objects (objs) from its slabs.

Before analyzing the kmalloc function itself, a quick overview of the kernel's physical memory allocation APIs:

__get_free_pages calls alloc_pages internally; it cannot allocate from HIGHMEM, and it allocates 2^order physically contiguous pages. Its single-page shorthand is __get_free_page, while get_zeroed_page returns a page filled with zeros. To allocate from the DMA zone you can call __get_dma_pages (which ultimately also calls __get_free_pages).

Finally, the lowest-level interface, alloc_pages, can allocate from any zone, provided the corresponding flags are set.
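To make the relationships concrete, here is a hedged sketch of these page-level interfaces (illustrative only; the function name page_api_demo and the chosen orders are my own):

#include <linux/gfp.h>
#include <linux/mm.h>

static void page_api_demo(void)
{
    unsigned long addr, zpage, dma;
    struct page *pg;

    addr  = __get_free_pages(GFP_KERNEL, 2);   /* 2^2 = 4 contiguous low-memory pages */
    zpage = get_zeroed_page(GFP_KERNEL);       /* one page, pre-filled with zeros */
    dma   = __get_dma_pages(GFP_KERNEL, 0);    /* one page from ZONE_DMA */
    pg    = alloc_pages(GFP_HIGHUSER, 0);      /* struct page *, may come from ZONE_HIGHMEM */

    if (addr)
        free_pages(addr, 2);
    if (zpage)
        free_page(zpage);
    if (dma)
        free_page(dma);
    if (pg)
        __free_pages(pg, 0);
}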

Reference kernel: Linux 3.18.13

Reference books: 《Linux内核设计与实现》 (Linux Kernel Development), 《Linux设备驱动程序》 (Linux Device Drivers), 《深入理解Linux设备驱动内核机制》

Now let's look at kmalloc itself (the allocation flags are not discussed here; see the references for details).

First, the header to include:

#include <linux/slab.h>

Its concrete definition depends on which allocator is configured; slab.h picks the matching header:


#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
#include <linux/slab_def.h>
#endif

With the classic SLAB allocator (the default configuration assumed in this post) this resolves to #include <linux/slab_def.h>, where kmalloc is defined as follows:


static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
    struct kmem_cache *cachep;
    void *ret;

    if (__builtin_constant_p(size)) {
        int i = 0;

        if (!size)
            return ZERO_SIZE_PTR;

#define CACHE(x) \
        if (size <= x) \
            goto found; \
        else \
            i++;
#include <linux/kmalloc_sizes.h>    /* find which size class the request falls into:
                                     * sizes start at 32 and keep doubling, and i is
                                     * incremented for every entry that is too small */
#undef CACHE
        return NULL;
found:
#ifdef CONFIG_ZONE_DMA
        if (flags & GFP_DMA)
            cachep = malloc_sizes[i].cs_dmacachep;    /* if CONFIG_ZONE_DMA is enabled and GFP_DMA
                                                       * is set, allocate from the DMA cache;
                                                       * malloc_sizes[] is initialized in slab.c */
        else
#endif
            cachep = malloc_sizes[i].cs_cachep;       /* allocate from the matching general cache,
                                                       * so little space is wasted */

        ret = kmem_cache_alloc_trace(cachep, flags, size);

        return ret;
    }
    return __kmalloc(size, flags);
}

For reference, here is kmalloc_sizes.h, which supplies the CACHE(x) entries:


#if (PAGE_SIZE == 4096)
CACHE(32)
#endif
CACHE(64)
#if L1_CACHE_BYTES < 64
CACHE(96)
#endif
CACHE(128)
#if L1_CACHE_BYTES < 128
CACHE(192)
#endif
CACHE(256)
CACHE(512)
CACHE(1024)
CACHE(2048)
CACHE(4096)
CACHE(8192)
CACHE(16384)
CACHE(32768)
CACHE(65536)
CACHE(131072)
#if KMALLOC_MAX_SIZE >= 262144
CACHE(262144)
#endif
#if KMALLOC_MAX_SIZE >= 524288
CACHE(524288)
#endif
#if KMALLOC_MAX_SIZE >= 1048576
CACHE(1048576)
#endif
#if KMALLOC_MAX_SIZE >= 2097152
CACHE(2097152)
#endif
#if KMALLOC_MAX_SIZE >= 4194304
CACHE(4194304)
#endif
#if KMALLOC_MAX_SIZE >= 8388608
CACHE(8388608)
#endif
#if KMALLOC_MAX_SIZE >= 16777216
CACHE(16777216)
#endif
#if KMALLOC_MAX_SIZE >= 33554432
CACHE(33554432)
#endif
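To see what the expansion inside kmalloc achieves, here is a small user-space sketch of the size-to-index lookup (my own illustration; it assumes PAGE_SIZE == 4096 and L1_CACHE_BYTES == 64, so the active size classes start 32, 64, 128, 192, ...):

#include <stdio.h>

int main(void)
{
    const unsigned long sizes[] = { 32, 64, 128, 192, 256, 512, 1024, 2048, 4096 };
    unsigned long request = 100;
    int i;

    for (i = 0; i < (int)(sizeof(sizes) / sizeof(sizes[0])); i++)
        if (request <= sizes[i])
            break;                        /* the expanded CACHE(x) chain does "goto found" here */

    printf("kmalloc(%lu) -> %lu-byte cache, i = %d\n", request, sizes[i], i);
    /* prints: kmalloc(100) -> 128-byte cache, i = 2 */
    return 0;
}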

A note on the check at the top of the function: __builtin_constant_p is a GCC built-in that reports whether an expression is a compile-time constant; it returns 1 for a constant and 0 otherwise. Its typical use is manual compile-time optimization inside macros. So when size is a compile-time constant, kmalloc resolves the target cache at compile time and calls kmem_cache_alloc_trace directly; otherwise it falls back to __kmalloc(size, flags).
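A tiny user-space illustration of __builtin_constant_p (my own example):

#include <stdio.h>

int main(int argc, char **argv)
{
    (void)argv;
    printf("%d\n", __builtin_constant_p(64));      /* 1: a literal is known at compile time */
    printf("%d\n", __builtin_constant_p(argc));    /* 0: argc is only known at run time */
    return 0;
}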

Having looked up which general cache the requested size belongs to, kmalloc then calls:


#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
#else
static __always_inline void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
    return kmem_cache_alloc(cachep, flags);
}
#endif

Let's look at kmem_cache_alloc itself:


/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *ret = slab_alloc(cachep, flags, _RET_IP_);

    trace_kmem_cache_alloc(_RET_IP_, ret,    /* used for tracing/debugging */
                           cachep->object_size, cachep->size, flags);

    return ret;
}
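As a side note, kmem_cache_alloc is also what drivers use when they create their own object caches; a hedged sketch (struct demo, demo_cache and the function names are made up for illustration):

#include <linux/slab.h>
#include <linux/errno.h>

struct demo {
    int id;
    char name[16];
};

static struct kmem_cache *demo_cache;

static int demo_cache_setup(void)
{
    demo_cache = kmem_cache_create("demo_cache", sizeof(struct demo),
                                   0, SLAB_HWCACHE_ALIGN, NULL);
    return demo_cache ? 0 : -ENOMEM;
}

static void demo_cache_use(void)
{
    struct demo *d = kmem_cache_alloc(demo_cache, GFP_KERNEL);

    if (d)
        kmem_cache_free(demo_cache, d);
}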

The actual allocation is done by slab_alloc:


static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
    unsigned long save_flags;
    void *objp;

    flags &= gfp_allowed_mask;    /* explained in gfp.h:
                                   *
                                   * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to
                                   * restrict what GFP flags are used before interrupts are enabled.
                                   * Once interrupts are enabled, it is set to __GFP_BITS_MASK while
                                   * the system is running.  During hibernation, it is used by PM to
                                   * avoid I/O during memory allocation while devices are suspended.
                                   *
                                   *     extern gfp_t gfp_allowed_mask;
                                   */

    lockdep_trace_alloc(flags);    /* debugging */

    if (slab_should_failslab(cachep, flags))
        return NULL;

    cachep = memcg_kmem_get_cache(cachep, flags);

    cache_alloc_debugcheck_before(cachep, flags);
    local_irq_save(save_flags);
    objp = __do_cache_alloc(cachep, flags);
    local_irq_restore(save_flags);
    objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
    kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
                             flags);
    prefetchw(objp);

    if (likely(objp))
        kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);

    if (unlikely((flags & __GFP_ZERO) && objp))
        memset(objp, 0, cachep->object_size);

    return objp;
}
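Note the __GFP_ZERO handling at the end: this is what kzalloc relies on. kzalloc is essentially kmalloc with __GFP_ZERO or'ed in, roughly like this (sketch named my_kzalloc to avoid clashing with the real definition in <linux/slab.h>):

#include <linux/slab.h>

static inline void *my_kzalloc(size_t size, gfp_t flags)
{
    return kmalloc(size, flags | __GFP_ZERO);    /* slab_alloc() then memsets the object to 0 */
}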

slab_alloc calls objp = __do_cache_alloc(cachep, flags), which after checking a few flags ends up in ____cache_alloc(cachep, flags). This is the common entry point (it covers both NUMA and UMA; Linux is UMA by default unless NUMA is configured):


static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;
    struct array_cache *ac;
    bool force_refill = false;

    check_irq_off();

    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        ac->touched = 1;
        objp = ac_get_obj(cachep, ac, flags, false);

        /*
         * Allow for the possibility all avail objects are not allowed
         * by the current flags
         */
        if (objp) {
            STATS_INC_ALLOCHIT(cachep);
            goto out;
        }
        force_refill = true;
    }

    STATS_INC_ALLOCMISS(cachep);
    objp = cache_alloc_refill(cachep, flags, force_refill);
    /*
     * the 'ac' may be updated by cache_alloc_refill(),
     * and kmemleak_erase() requires its correct value.
     */
    ac = cpu_cache_get(cachep);

out:
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}
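Conceptually the per-CPU array_cache fast path is just a small LIFO stack of object pointers. Here is a simplified user-space model of the idea (my own sketch, not the kernel's real layout, locking or refill logic):

#include <stdio.h>

struct toy_array_cache {
    unsigned int avail;     /* how many object pointers are cached */
    unsigned int limit;
    void *entry[64];        /* LIFO stack of recently freed objects */
};

static void *toy_alloc(struct toy_array_cache *ac)
{
    if (ac->avail)                      /* ALLOCHIT: pop the most recently freed object */
        return ac->entry[--ac->avail];
    return NULL;                        /* ALLOCMISS: would fall back to cache_alloc_refill() */
}

static void toy_free(struct toy_array_cache *ac, void *obj)
{
    if (ac->avail < ac->limit)
        ac->entry[ac->avail++] = obj;   /* push back for the next fast allocation */
}

int main(void)
{
    static int object;
    struct toy_array_cache ac = { .avail = 0, .limit = 64 };

    toy_free(&ac, &object);
    printf("fast-path alloc returns %p (avail now %u)\n", toy_alloc(&ac), ac.avail);
    printf("empty cache returns %p -> refill needed\n", toy_alloc(&ac));
    return 0;
}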

Here we assume this is the first time memory is allocated from this cache. Given how malloc_sizes[] is initialized in kmem_cache_init, the kmalloc_cache pointer returned to kmalloc points at a cache whose per-CPU array was set up by this function:


static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
    if (slab_state >= FULL)
        return enable_cpucache(cachep, gfp);

    if (slab_state == DOWN) {
        /*
         * Note: Creation of first cache (kmem_cache).
         * The setup_list3s is taken care
         * of by the caller of __kmem_cache_create
         */
        cachep->array[smp_processor_id()] = &initarray_generic.cache;
        slab_state = PARTIAL;
    } else if (slab_state == PARTIAL) {
        /*
         * Note: the second kmem_cache_create must create the cache
         * that's used by kmalloc(24), otherwise the creation of
         * further caches will BUG().
         */
        cachep->array[smp_processor_id()] = &initarray_generic.cache;

        /*
         * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
         * the second cache, then we need to set up all its list3s,
         * otherwise the creation of further caches will BUG().
         */
        set_up_list3s(cachep, SIZE_AC);
        if (INDEX_AC == INDEX_L3)
            slab_state = PARTIAL_L3;
        else
            slab_state = PARTIAL_ARRAYCACHE;
    } else {
        /* Remaining boot caches */
        cachep->array[smp_processor_id()] =
            kmalloc(sizeof(struct arraycache_init), gfp);

        if (slab_state == PARTIAL_ARRAYCACHE) {
            set_up_list3s(cachep, SIZE_L3);
            slab_state = PARTIAL_L3;
        } else {
            int node;

            for_each_online_node(node) {
                cachep->nodelists[node] =
                    kmalloc_node(sizeof(struct kmem_list3),
                                 gfp, node);
                BUG_ON(!cachep->nodelists[node]);
                kmem_list3_init(cachep->nodelists[node]);
            }
        }
    }
    cachep->nodelists[numa_mem_id()]->next_reap =
        jiffies + REAPTIMEOUT_LIST3 +
        ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

    cpu_cache_get(cachep)->avail = 0;
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;    /* BOOT_CPUCACHE_ENTRIES == 1 */
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;
}

Note that whatever array ends up pointing to, avail and the other fields are initialized at the end (avail = 0, limit = BOOT_CPUCACHE_ENTRIES = 1, batchcount = 1).

So if the per-CPU array has no available objects, ____cache_alloc calls cache_alloc_refill; if the array does have objects, the requested object pointer is returned directly:


static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
                                bool force_refill)
{
    int batchcount;
    struct kmem_list3 *l3;
    struct array_cache *ac;
    int node;

    check_irq_off();
    node = numa_mem_id();
    if (unlikely(force_refill))
        goto force_grow;
retry:
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill.  Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    l3 = cachep->nodelists[node];

    BUG_ON(ac->avail > 0 || !l3);
    spin_lock(&l3->list_lock);

    /* See if we can refill from the shared array */
    if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
        l3->shared->touched = 1;
        goto alloc_done;
    }

    while (batchcount > 0) {
        struct list_head *entry;
        struct slab *slabp;
        /* Get slab alloc is to come from. */
        entry = l3->slabs_partial.next;
        if (entry == &l3->slabs_partial) {
            l3->free_touched = 1;
            entry = l3->slabs_free.next;
            if (entry == &l3->slabs_free)
                goto must_grow;
        }

        slabp = list_entry(entry, struct slab, list);
        check_slabp(cachep, slabp);
        check_spinlock_acquired(cachep);

        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(slabp->inuse >= cachep->num);

        while (slabp->inuse < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
                                                node));
        }
        check_slabp(cachep, slabp);

        /* move slabp to correct slabp list: */
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &l3->slabs_full);
        else
            list_add(&slabp->list, &l3->slabs_partial);
    }

must_grow:
    l3->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&l3->list_lock);

    if (unlikely(!ac->avail)) {
        int x;
force_grow:
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);    /* returns 1 on success */

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);
        node = numa_mem_id();

        /* no objects in sight? abort */
        if (!x && (ac->avail == 0 || force_refill))
            return NULL;

        if (!ac->avail)        /* objects refilled by interrupt? */
            goto retry;
    }
    ac->touched = 1;

    return ac_get_obj(cachep, ac, flags, force_refill);
}

Since this is the first use, the slab lists on the nodelist are all empty, so we fall through to must_grow.

That calls cache_grow. The function first works out the slab coloring offset, then calls kmem_getpages to allocate pages (2^gfporder of them, according to cachep->gfporder) and returns the virtual address of the allocated pages.
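As a rough aside before the code: the number of objects a slab can hold is roughly the slab size (2^gfporder pages) divided by the object size. This is my own back-of-the-envelope sketch; the real calculation, which also accounts for the management area and alignment, is done by cache_estimate()/calculate_slab_order() in slab.c:

#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096;     /* assume PAGE_SIZE == 4096 */
    unsigned int  gfporder  = 0;        /* one page per slab */
    unsigned long obj_size  = 128;      /* e.g. the 128-byte kmalloc cache */
    unsigned long slab_bytes = page_size << gfporder;

    printf("objects per slab (upper bound): %lu\n", slab_bytes / obj_size);
    return 0;
}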


/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
static int cache_grow(struct kmem_cache *cachep,
                      gfp_t flags, int nodeid, void *objp)
{
    struct slab *slabp;
    size_t offset;
    gfp_t local_flags;
    struct kmem_list3 *l3;

    /*
     * Be lazy and only check for valid flags here, keeping it out of the
     * critical path in kmem_cache_alloc().
     */
    BUG_ON(flags & GFP_SLAB_BUG_MASK);
    local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

    /* Take the l3 list lock to change the colour_next on this node */
    check_irq_off();
    l3 = cachep->nodelists[nodeid];
    spin_lock(&l3->list_lock);

    /* Get colour for the slab, and cal the next value. */
    offset = l3->colour_next;        /* default 0 */
    l3->colour_next++;
    if (l3->colour_next >= cachep->colour)
        l3->colour_next = 0;
    spin_unlock(&l3->list_lock);

    offset *= cachep->colour_off;    /* first time, offset is 0 */

    if (local_flags & __GFP_WAIT)
        local_irq_enable();

    /*
     * The test for missing atomic flag is performed here, rather than
     * the more obvious place, simply to reduce the critical path length
     * in kmem_cache_alloc().  If a caller is seriously mis-behaving they
     * will eventually be caught here (where it matters).
     */
    kmem_flagcheck(cachep, flags);

    /*
     * Get mem for the objs.  Attempt to allocate a physical page from
     * 'nodeid'.
     */
    if (!objp)
        objp = kmem_getpages(cachep, local_flags, nodeid);
    if (!objp)
        goto failed;

    /* Get slab management. */
    slabp = alloc_slabmgmt(cachep, objp, offset,
                           local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
    if (!slabp)
        goto opps1;

    slab_map_pages(cachep, slabp, objp);

    cache_init_objs(cachep, slabp);

    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    check_irq_off();
    spin_lock(&l3->list_lock);

    /* Make slab active. */
    list_add_tail(&slabp->list, &(l3->slabs_free));    /* add the new slab to the node's slabs_free list */
    STATS_INC_GROWN(cachep);
    l3->free_objects += cachep->num;                   /* each slab contributes cachep->num free objects */
    spin_unlock(&l3->list_lock);
    return 1;
opps1:
    kmem_freepages(cachep, objp);
failed:
    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    return 0;
}

Slab coloring is about the hardware cache: by staggering where objects start, it tries to reduce cache conflict misses and make better use of cache lines. See 《深入理解计算机系统》 (Computer Systems: A Programmer's Perspective) for background.
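A toy illustration of the idea (my own sketch; it assumes colour_off == 32 bytes, matching the cache_line_size() note further below, and 3 available colours computed from the leftover space): successive slabs start their first object at different cache-line-sized offsets, so objects at the same index in different slabs do not all compete for the same cache lines.

#include <stdio.h>

int main(void)
{
    unsigned int colour = 3, colour_off = 32, colour_next = 0;
    int slab;

    for (slab = 0; slab < 5; slab++) {
        printf("slab %d: first object at offset %u\n", slab, colour_next * colour_off);
        if (++colour_next >= colour)
            colour_next = 0;            /* offsets cycle: 0, 32, 64, 0, 32, ... */
    }
    return 0;
}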

The concrete handling is in alloc_slabmgmt:


/*
 * Get the memory for a slab management obj.
 * For a slab cache when the slab descriptor is off-slab, slab descriptors
 * always come from malloc_sizes caches.  The slab descriptor cannot
 * come from the same cache which is getting created because,
 * when we are searching for an appropriate cache for these
 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
 * If we are creating a malloc_sizes cache here it would not be visible to
 * kmem_find_general_cachep till the initialization is complete.
 * Hence we cannot have slabp_cache same as the original cache.
 */
static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
                                   int colour_off, gfp_t local_flags,
                                   int nodeid)
{
    struct slab *slabp;

    if (OFF_SLAB(cachep)) {
        /*
         * About OFF_SLAB: the CFLGS_OFF_SLAB flag is set in
         * __kmem_cache_create().  Bootstrapping cannot cope with off-slab
         * caches, so it is not done too early, and on-slab management is
         * always used with SLAB_NOLEAKTRACE to avoid recursive calls into
         * kmemleak.  For large objects the management data is placed
         * off-slab, which should allow better packing of the objects:
         *
         *     if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
         *         !(flags & SLAB_NOLEAKTRACE))
         *         flags |= CFLGS_OFF_SLAB;
         */
        /* Slab management obj is off-slab. */
        slabp = kmem_cache_alloc_node(cachep->slabp_cache,
                                      local_flags, nodeid);
        /*
         * If the first object in the slab is leaked (it's allocated
         * but no one has a reference to it), we want to make sure
         * kmemleak does not treat the ->s_mem pointer as a reference
         * to the object.  Otherwise we will not report the leak.
         */
        kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
                           local_flags);
        if (!slabp)
            return NULL;
    } else {
        slabp = objp + colour_off;       /* __kmem_cache_create sets
                                          * cachep->colour_off = cache_line_size()
                                          * (#defined to L1_CACHE_BYTES in cache.h,
                                          * typically 32 bytes) and
                                          * cachep->colour = left_over / cachep->colour_off */
        colour_off += cachep->slab_size;
    }
    slabp->inuse = 0;                    /* number of objs active in the slab */
    slabp->colouroff = colour_off;       /* offset of the first obj relative to the page address */
    slabp->s_mem = objp + colour_off;    /* address of the first obj */
    slabp->nodeid = nodeid;
    slabp->free = 0;
    return slabp;
}

Finally, let's look at another very important step, initializing the objects in the new slab:


static void cache_init_objs(struct kmem_cache *cachep,
                            struct slab *slabp)
{
    int i;

    for (i = 0; i < cachep->num; i++) {
        void *objp = index_to_obj(cachep, slabp, i);
#if DEBUG
        /* need to poison the objs? */
        if (cachep->flags & SLAB_POISON)
            poison_obj(cachep, objp, POISON_FREE);
        if (cachep->flags & SLAB_STORE_USER)
            *dbg_userword(cachep, objp) = NULL;

        if (cachep->flags & SLAB_RED_ZONE) {
            *dbg_redzone1(cachep, objp) = RED_INACTIVE;
            *dbg_redzone2(cachep, objp) = RED_INACTIVE;
        }
        /*
         * Constructors are not allowed to allocate memory from the same
         * cache which they are a constructor for.  Otherwise, deadlock.
         * They must also be threaded.
         */
        if (cachep->ctor && !(cachep->flags & SLAB_POISON))
            cachep->ctor(objp + obj_offset(cachep));

        if (cachep->flags & SLAB_RED_ZONE) {
            if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
                slab_error(cachep, "constructor overwrote the"
                           " end of an object");
            if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
                slab_error(cachep, "constructor overwrote the"
                           " start of an object");
        }
        if ((cachep->size % PAGE_SIZE) == 0 &&
            OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
            kernel_map_pages(virt_to_page(objp),
                             cachep->size / PAGE_SIZE, 0);
#else
        if (cachep->ctor)
            cachep->ctor(objp);    /* initialize the object with its constructor, if any */
#endif
        slab_bufctl(slabp)[i] = i + 1;    /* build the index free list: 1, 2, 3, 4, ... */
    }
    slab_bufctl(slabp)[i - 1] = BUFCTL_END;    /* the last entry terminates the free list */
}
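The bufctl loop at the end builds an index-linked free list: entry i points at the next free object index, and the last entry is BUFCTL_END. A toy user-space model of how slab_get_obj() then walks it (my own sketch; BUFCTL_END is just a sentinel value here):

#include <stdio.h>

#define NUM        8
#define BUFCTL_END (~0U)

int main(void)
{
    unsigned int bufctl[NUM];
    unsigned int i, free = 0;              /* slabp->free starts at object 0 */
    unsigned int obj_index;

    for (i = 0; i < NUM; i++)
        bufctl[i] = i + 1;                 /* each free object points at the next index */
    bufctl[NUM - 1] = BUFCTL_END;          /* the last entry terminates the list */

    obj_index = free;                      /* slab_get_obj(): take object 'free' ... */
    free = bufctl[obj_index];              /* ... then advance free to the next index */
    printf("allocated object %u, next free index %u\n", obj_index, free);
    return 0;
}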