
Linux memory management: kmalloc (2)

2015-10-02 00:20
Original post: http://blog.chinaunix.net/uid-20786208-id-4785655.html
The previous article briefly described how kmalloc allocates memory under the slab allocator. While reading the function cache_alloc_refill, though, part of its logic still puzzled me.
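As a quick reminder of the interface under discussion, here is a minimal kmalloc/kfree usage sketch. This is my own illustrative snippet, not code from the kernel; the function name example_alloc is made up. An allocation like this is typically served from one of the general-purpose size-N caches that appear later in the /proc/slabinfo dump.

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Illustrative only: allocate a small buffer from the slab layer and free it. */
static int example_alloc(void)
{
    char *buf;

    buf = kmalloc(128, GFP_KERNEL);   /* typically served from the size-128 general cache */
    if (!buf)
        return -ENOMEM;

    memset(buf, 0, 128);
    /* ... use buf ... */
    kfree(buf);                       /* the object goes back to the per-cpu array_cache */
    return 0;
}

With that in mind, here is cache_alloc_refill again: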


static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
                                bool force_refill)
{
    int batchcount;
    struct kmem_list3 *l3;
    struct array_cache *ac;
    int node;

    check_irq_off();
    node = numa_mem_id();
    if (unlikely(force_refill))
        goto force_grow;
retry:
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill. Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    l3 = cachep->nodelists[node];

    BUG_ON(ac->avail > 0 || !l3);
    spin_lock(&l3->list_lock);

    /* See if we can refill from the shared array */
    if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
        l3->shared->touched = 1;
        goto alloc_done;
    }

    while (batchcount > 0) {
        struct list_head *entry;
        struct slab *slabp;
        /* Get slab alloc is to come from. */
        entry = l3->slabs_partial.next;
        if (entry == &l3->slabs_partial) {
            l3->free_touched = 1;
            entry = l3->slabs_free.next;
            if (entry == &l3->slabs_free)
                goto must_grow;
        }

        slabp = list_entry(entry, struct slab, list);
        check_slabp(cachep, slabp);
        check_spinlock_acquired(cachep);

        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(slabp->inuse >= cachep->num);

        while (slabp->inuse < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, node));
        }
        check_slabp(cachep, slabp);

        /* move slabp to correct slabp list: */
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &l3->slabs_full);
        else
            list_add(&slabp->list, &l3->slabs_partial);
    }

must_grow:
    l3->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&l3->list_lock);

    if (unlikely(!ac->avail)) {
        int x;
force_grow:
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);
        node = numa_mem_id();

        /* no objects in sight? abort */
        if (!x && (ac->avail == 0 || force_refill))
            return NULL;

        if (!ac->avail)        /* objects refilled by interrupt? */
            goto retry;
    }
    ac->touched = 1;

    return ac_get_obj(cachep, ac, flags, force_refill);
}

The question is mainly about batchcount = ac->batchcount;. During default initialization, that is in kmem_cache_init, every one of the system's caches ends up in __kmem_cache_create, and setup_cpu_cache there contains this code:


    cpu_cache_get(cachep)->avail = 0;
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;

So can I simply conclude that ac->batchcount is 1, and that ac_put_obj therefore puts only one object into the array at a time? If that happened on every allocation, then what about this fast path in __cache_alloc:


    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        ac->touched = 1;
        objp = ac_get_obj(cachep, ac, flags, false);

        /*
         * Allow for the possibility all avail objects are not allowed
         * by the current flags
         */
        if (objp) {
            STATS_INC_ALLOCHIT(cachep);
            goto out;
        }
        force_refill = true;
    }

What would be the point of it? With batchcount equal to 1, each refill puts a single object into the array, taking avail from 0 to 1, but as soon as one object is taken avail is back to 0 again. That would obviously be very inefficient.
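To see concretely why a batchcount of 1 would be so wasteful, here is a small user-space toy model of the per-cpu array_cache. This is entirely my own sketch; toy_alloc and toy_refill are invented names that only mimic the fast-path/refill split, not kernel code. With batchcount = 1, every allocation that finds the array empty has to go back to the slab lists; a larger batchcount amortizes that trip over many allocations.

#include <stdio.h>

#define LIMIT 120

/* Toy model of a per-cpu array_cache: a LIFO stack of object pointers. */
struct toy_array_cache {
    unsigned int avail;
    unsigned int limit;
    unsigned int batchcount;
    void *entry[LIMIT];
};

static unsigned long refills;   /* how often we had to fall back to the "slab lists" */

/* Stand-in for cache_alloc_refill(): pretend to pull batchcount objects off a slab. */
static void toy_refill(struct toy_array_cache *ac)
{
    static char objects[4096];
    static unsigned int next;
    unsigned int i;

    refills++;
    for (i = 0; i < ac->batchcount && ac->avail < ac->limit; i++)
        ac->entry[ac->avail++] = &objects[next++ % sizeof(objects)];
}

/* Stand-in for __cache_alloc(): fast path hits the array, a miss triggers a refill. */
static void *toy_alloc(struct toy_array_cache *ac)
{
    if (!ac->avail)
        toy_refill(ac);
    return ac->entry[--ac->avail];
}

int main(void)
{
    struct toy_array_cache ac = { .limit = LIMIT, .batchcount = 1 };
    int i;

    for (i = 0; i < 1000; i++)
        toy_alloc(&ac);
    printf("batchcount=1  -> %lu refills for 1000 allocations\n", refills);

    refills = 0;
    ac.avail = 0;
    ac.batchcount = 60;
    for (i = 0; i < 1000; i++)
        toy_alloc(&ac);
    printf("batchcount=60 -> %lu refills for 1000 allocations\n", refills);
    return 0;
}

Compiled and run, the first loop performs one refill per allocation, while the second needs only about 1000/60 refills; that saving is exactly what the tuned batchcount buys in the real allocator.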

It later turned out that I simply had not read all of the code. Look at the following function, which is called after kmem_cache_init has finished:


void __init kmem_cache_init_late(void)
{
    struct kmem_cache *cachep;

    slab_state = UP;

    /* 6) resize the head arrays to their final sizes */
    mutex_lock(&slab_mutex);
    list_for_each_entry(cachep, &slab_caches, list)
        if (enable_cpucache(cachep, GFP_NOWAIT))
            BUG();
    mutex_unlock(&slab_mutex);

    /* Annotate slab for lockdep -- annotate the malloc caches */
    init_lock_keys();

    /* Done! */
    slab_state = FULL;

    /*
     * Register a cpu startup notifier callback that initializes
     * cpu_cache_get for all new cpus
     */
    register_cpu_notifier(&cpucache_notifier);

#ifdef CONFIG_NUMA
    /*
     * Register a memory hotplug callback that initializes and frees
     * nodelists.
     */
    hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif

    /*
     * The reap timers are started later, with a module init call: That part
     * of the kernel is not yet operational.
     */
}

This function simply walks the slab_caches list and calls enable_cpucache(cachep, GFP_NOWAIT) on every cache!


/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
    int err;
    int limit = 0;
    int shared = 0;
    int batchcount = 0;

    if (!is_root_cache(cachep)) {
        struct kmem_cache *root = memcg_root_cache(cachep);
        limit = root->limit;
        shared = root->shared;
        batchcount = root->batchcount;
    }

    if (limit && shared && batchcount)
        goto skip_setup;
    /*
     * The head array serves three purposes:
     * - create a LIFO ordering, i.e. return objects that are cache-warm
     * - reduce the number of spinlock operations.
     * - reduce the number of linked list operations on the slab and
     *   bufctl chains: array operations are cheaper.
     * The numbers are guessed, we should auto-tune as described by
     * Bonwick.
     */
    if (cachep->size > 131072)          /* objects larger than 128 KB get limit = 1 */
        limit = 1;
    else if (cachep->size > PAGE_SIZE)
        limit = 8;
    else if (cachep->size > 1024)
        limit = 24;
    else if (cachep->size > 256)
        limit = 54;
    else
        limit = 120;

    /*
     * CPU bound tasks (e.g. network routing) can exhibit cpu bound
     * allocation behaviour: Most allocs on one cpu, most free operations
     * on another cpu. For these cases, an efficient object passing between
     * cpus is necessary. This is provided by a shared array. The array
     * replaces Bonwick's magazine layer.
     * On uniprocessor, it's functionally equivalent (but less efficient)
     * to a larger limit. Thus disabled by default.
     */
    shared = 0;
    if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)  /* SMP: shared = 8; uniprocessor: 0 */
        shared = 8;

#if DEBUG
    /*
     * With debugging enabled, large batchcount lead to excessively long
     * periods with disabled local interrupts. Limit the batchcount
     */
    if (limit > 32)
        limit = 32;
#endif
    batchcount = (limit + 1) / 2;
skip_setup:
    err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);  /* write the tuned values into the cache */
    if (err)
        printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
               cachep->name, -err);
    return err;
}

Right, so here we finally see the real initialization of limit, shared and batchcount.
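Plugging a few object sizes into those heuristics shows the numbers that end up in /proc/slabinfo. The following user-space sketch is my own; it assumes PAGE_SIZE is 4096 and simply replays the size thresholds and the batchcount = (limit + 1) / 2 rule from enable_cpucache.

#include <stdio.h>

#define PAGE_SIZE 4096      /* assumption: 4 KB pages, as on most x86/ARM configurations */

/* Replay of the limit heuristic in enable_cpucache(). */
static int pick_limit(unsigned int size)
{
    if (size > 131072)
        return 1;
    else if (size > PAGE_SIZE)
        return 8;
    else if (size > 1024)
        return 24;
    else if (size > 256)
        return 54;
    else
        return 120;
}

int main(void)
{
    unsigned int sizes[] = { 32, 128, 256, 512, 1024, 2048, 8192, 262144 };
    unsigned int i;

    for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
        int limit = pick_limit(sizes[i]);
        int batchcount = (limit + 1) / 2;
        printf("object size %6u -> limit %3d, batchcount %3d\n",
               sizes[i], limit, batchcount);
    }
    return 0;
}

The results line up with the tunables columns in the slabinfo dump further down: size-2048 shows 24 12, size-1024 and size-512 show 54 27, and size-128 shows 120 60. The chosen values are then pushed into each cache through do_tune_cpucache: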


static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
                            int batchcount, int shared, gfp_t gfp)
{
    int ret;
    struct kmem_cache *c = NULL;
    int i = 0;

    ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);  /* tune the cache that was passed in */

    if (slab_state < FULL)
        return ret;

    if ((ret < 0) || !is_root_cache(cachep))
        return ret;

    VM_BUG_ON(!mutex_is_locked(&slab_mutex));
    for_each_memcg_cache_index(i) {
        c = cache_from_memcg(cachep, i);
        if (c)
            /* return value determined by the parent cache only */
            __do_tune_cpucache(c, limit, batchcount, shared, gfp);
    }

    return ret;
}

The actual work happens in:


/* Always called with the slab_mutex held */
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
                              int batchcount, int shared, gfp_t gfp)
{
    struct ccupdate_struct *new;
    /*
     * A note on the structure above; it is defined as:
     *
     *     struct ccupdate_struct {
     *         struct kmem_cache *cachep;
     *         struct array_cache *new[0];
     *     };
     */
    int i;

    new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
                  gfp);     /* new is freed again before this function returns; it is only a staging area */
    if (!new)
        return -ENOMEM;

    for_each_online_cpu(i) {
        new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
                                       batchcount, gfp);
        if (!new->new[i]) {
            for (i--; i >= 0; i--)
                kfree(new->new[i]);
            kfree(new);
            return -ENOMEM;
        }
    }
    new->cachep = cachep;

    on_each_cpu(do_ccupdate_local, (void *)new, 1);   /* key point: run do_ccupdate_local on every cpu with new as its argument */

    check_irq_on();
    cachep->batchcount = batchcount;
    cachep->limit = limit;
    cachep->shared = shared;

    for_each_online_cpu(i) {
        struct array_cache *ccold = new->new[i];
        if (!ccold)
            continue;
        spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
        spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        kfree(ccold);
    }
    kfree(new);
    return alloc_kmemlist(cachep, gfp);
}

Let us see what do_ccupdate_local does:


static void do_ccupdate_local(void *info)
{
    struct ccupdate_struct *new = info;
    struct array_cache *old;

    check_irq_off();
    old = cpu_cache_get(new->cachep);

    /*
     * new->cachep already points to our cache, so this redirects the
     * cache's per-cpu array pointer to the newly allocated array_cache.
     */
    new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
    /* hand the old array back to the caller so it can free its objects */
    new->new[smp_processor_id()] = old;
}

The new->new arrays themselves were initialized when they were allocated, in alloc_arraycache (called from __do_tune_cpucache above):

static struct array_cache *alloc_arraycache(int node, int entries,
                                            int batchcount, gfp_t gfp)
{
    int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
    struct array_cache *nc = NULL;

    nc = kmalloc_node(memsize, gfp, node);
    /*
     * The array_cache structures contain pointers to free object.
     * However, when such objects are allocated or transferred to another
     * cache the pointers are not cleared and they could be counted as
     * valid references during a kmemleak scan. Therefore, kmemleak must
     * not scan such objects.
     */
    kmemleak_no_scan(nc);
    if (nc) {
        nc->avail = 0;
        nc->limit = entries;
        nc->batchcount = batchcount;
        nc->touched = 0;
        spin_lock_init(&nc->lock);
    }
    return nc;
}

This ties back in with cache_alloc_refill: once the system is fully up, the ac->batchcount it reads is the tuned value, not the boot-time 1.
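The whole tuning path can be condensed into a small user-space model. This is again my own toy code with invented names (toy_cache, toy_ccupdate, toy_ccupdate_local); it only mimics the pattern: boot-time arrays start with batchcount = 1, do_tune_cpucache builds tuned replacements, and the per-cpu pointers are swapped the way do_ccupdate_local does it, after which cache_alloc_refill sees the new value.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4   /* toy model: pretend we have 4 cpus */

struct toy_array { int batchcount; };

struct toy_cache {
    struct toy_array *array[NR_CPUS];   /* per-cpu array_cache pointers */
};

struct toy_ccupdate {
    struct toy_cache *cachep;
    struct toy_array *new[NR_CPUS];
};

/* Toy version of do_ccupdate_local(): swap in the new array, remember the old one. */
static void toy_ccupdate_local(struct toy_ccupdate *u, int cpu)
{
    struct toy_array *old = u->cachep->array[cpu];

    u->cachep->array[cpu] = u->new[cpu];
    u->new[cpu] = old;   /* old array is handed back so the caller can free it */
}

int main(void)
{
    struct toy_cache cache;
    struct toy_ccupdate u = { .cachep = &cache };
    int cpu;

    /* boot-time state: batchcount = 1, as set up by setup_cpu_cache() */
    for (cpu = 0; cpu < NR_CPUS; cpu++) {
        cache.array[cpu] = malloc(sizeof(struct toy_array));
        cache.array[cpu]->batchcount = 1;
    }

    /* kmem_cache_init_late() -> do_tune_cpucache(): allocate tuned replacements */
    for (cpu = 0; cpu < NR_CPUS; cpu++) {
        u.new[cpu] = malloc(sizeof(struct toy_array));
        u.new[cpu]->batchcount = 60;
    }

    /* on_each_cpu(do_ccupdate_local, ...): swap the per-cpu pointers */
    for (cpu = 0; cpu < NR_CPUS; cpu++)
        toy_ccupdate_local(&u, cpu);

    printf("cpu0 batchcount after tuning: %d\n", cache.array[0]->batchcount);

    /* __do_tune_cpucache() then frees the old arrays it got back via u.new[] */
    for (cpu = 0; cpu < NR_CPUS; cpu++) {
        free(u.new[cpu]);
        free(cache.array[cpu]);
    }
    return 0;
}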

We can take a look at the actual slab information on a kernel with SLAB enabled:


cat /proc/slabinfo
slabinfo - version: 2.1
# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
nf_conntrack_expect 0 0 152 26 1 : tunables 120 60 8 : slabdata 0 0 0
nf_conntrack_8050c5f0 2 26 296 13 1 : tunables 54 27 8 : slabdata 2 2 0
bridge_fdb_cache 4 78 48 78 1 : tunables 120 60 8 : slabdata 1 1 0
fib6_nodes 12 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
ip6_dst_cache 25 57 208 19 1 : tunables 120 60 8 : slabdata 3 3 0
ip6_mrt_cache 0 0 112 35 1 : tunables 120 60 8 : slabdata 0 0 0
RAWv6 8 15 720 5 1 : tunables 54 27 8 : slabdata 3 3 0
UDPLITEv6 0 0 688 11 2 : tunables 54 27 8 : slabdata 0 0 0
UDPv6 3 22 688 11 2 : tunables 54 27 8 : slabdata 2 2 0
tw_sock_TCPv6 0 0 144 27 1 : tunables 120 60 8 : slabdata 0 0 0
request_sock_TCPv6 0 0 112 35 1 : tunables 120 60 8 : slabdata 0 0 0
TCPv6 5 6 1328 3 1 : tunables 24 12 8 : slabdata 2 2 0
ubi_wl_entry_slab 463 580 24 145 1 : tunables 120 60 8 : slabdata 4 4 0
sd_ext_cdb 2 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
fuse_request 0 0 384 10 1 : tunables 54 27 8 : slabdata 0 0 0
fuse_inode 0 0 416 9 1 : tunables 54 27 8 : slabdata 0 0 0
jffs2_inode_cache 15 145 24 145 1 : tunables 120 60 8 : slabdata 1 1 0
jffs2_node_frag 130 290 24 145 1 : tunables 120 60 8 : slabdata 2 2 0
uid_cache 0 0 48 78 1 : tunables 120 60 8 : slabdata 0 0 0
UNIX 24 32 480 8 1 : tunables 54 27 8 : slabdata 4 4 0
ip_mrt_cache 0 0 96 40 1 : tunables 120 60 8 : slabdata 0 0 0
UDP-Lite 0 0 560 7 1 : tunables 54 27 8 : slabdata 0 0 0
tcp_bind_bucket 6 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
inet_peer_cache 8 24 160 24 1 : tunables 120 60 8 : slabdata 1 1 0
ip_fib_trie 7 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
ip_fib_alias 8 145 24 145 1 : tunables 120 60 8 : slabdata 1 1 0
ip_dst_cache 6 27 144 27 1 : tunables 120 60 8 : slabdata 1 1 0
PING 0 0 528 7 1 : tunables 54 27 8 : slabdata 0 0 0
RAW 4 7 544 7 1 : tunables 54 27 8 : slabdata 1 1 0
UDP 13 14 560 7 1 : tunables 54 27 8 : slabdata 2 2 0
tw_sock_TCP 0 0 112 35 1 : tunables 120 60 8 : slabdata 0 0 0
request_sock_TCP 0 0 80 48 1 : tunables 120 60 8 : slabdata 0 0 0
TCP 1 6 1184 6 2 : tunables 24 12 8 : slabdata 1 1 0
......
size-2048(DMA) 0 0 2048 2 1 : tunables 24 12 8 : slabdata 0 0 0
size-2048 192 192 2048 2 1 : tunables 24 12 8 : slabdata 96 96 0
size-1024(DMA) 0 0 1024 4 1 : tunables 54 27 8 : slabdata 0 0 0
size-1024 215 216 1024 4 1 : tunables 54 27 8 : slabdata 54 54 0
size-512(DMA) 0 0 512 8 1 : tunables 54 27 8 : slabdata 0 0 0
size-512 601 624 512 8 1 : tunables 54 27 8 : slabdata 78 78 0
size-256(DMA) 0 0 256 15 1 : tunables 120 60 8 : slabdata 0 0 0
size-256 1234 1245 256 15 1 : tunables 120 60 8 : slabdata 83 83 0
size-192(DMA) 0 0 256 15 1 : tunables 120 60 8 : slabdata 0 0 0
size-192 287 300 256 15 1 : tunables 120 60 8 : slabdata 20 20 0
size-128(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-128 1890 1890 128 30 1 : tunables 120 60 8 : slabdata 63 63 0
size-96(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-96 930 930 128 30 1 : tunables 120 60 8 : slabdata 31 31 0
size-64(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-32(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-64 1577 1650 128 30 1 : tunables 120 60 8 : slabdata 55 55 0
size-32 6213 6300 128 30 1 : tunables 120 60 8 : slabdata 210 210 0
kmem_cache 150 160 96 40 1 : tunables 120 60 8 : slabdata 4 4 0
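On a CONFIG_SLAB kernel these tunables can also be changed at run time by writing a line of the form "name limit batchcount shared" to /proc/slabinfo, which reaches do_tune_cpucache through the slabinfo write handler. A minimal sketch, assuming we run as root and that the size-1024 cache from the dump above exists; the new values 108 54 8 are just an example:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* format: "<cache name> <limit> <batchcount> <shared>" */
    const char *request = "size-1024 108 54 8\n";
    int fd = open("/proc/slabinfo", O_WRONLY);

    if (fd < 0) {
        perror("open /proc/slabinfo");
        return 1;
    }
    if (write(fd, request, strlen(request)) < 0)
        perror("write tunables");
    close(fd);
    return 0;
}

The same thing can be done from a shell with echo "size-1024 108 54 8" > /proc/slabinfo. On a SLUB kernel the tunables columns read 0 and such a write is not honoured, which brings us to the next observation.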

You may notice that on an Ubuntu system the limit and batchcount values show up as 0; that is because it uses the SLUB allocator. In slub.c:


void __init kmem_cache_init_late(void)
{
}

While we are at it, a brief note on the differences between slab, slub and slob (for the actual implementations see slab.c, slub.c and slob.c in the kernel source):

slab is the basis on which both slub and slob are built.

SLOB targets embedded systems, mainly those with very limited memory, say under 32 MB; it pays little attention to large SMP systems, although there have recently been some small improvements in that area.

The SLUB allocator is meant to replace the slab code. By removing large numbers of queues and the overhead that goes with them and by simplifying the slab structure, SLUB promises better performance and better system scalability while keeping the existing slab allocator interface.

Having said all that, the slab mechanism is easiest to grasp from a simple diagram (see the original post linked above for the figure).
