
Linux memory management: preparation work for buddy-system page allocation

2016-04-27 22:26
The memory-management code in the Linux kernel has changed quite a bit, mostly in the details.

Let's start the walkthrough from __get_free_pages():

unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)

/*
 * Common helper functions.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
    struct page *page;

    /*
     * __get_free_pages() returns a 32-bit address, which cannot represent
     * a highmem page
     */
    VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); /* it returns a linear (logical) address, so unmapped highmem pages are not allowed */

    page = alloc_pages(gfp_mask, order);
    if (!page)
        return 0;
    return (unsigned long) page_address(page); /* convert the struct page to its linear kernel address */
}
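As a quick reminder of how this helper is used (a minimal sketch, not taken from the article; the function name is made up), the address must later be returned with free_pages() using the same order:

#include <linux/gfp.h>
#include <linux/errno.h>

/* Minimal usage sketch: allocate an order-2 block (4 contiguous pages,
 * i.e. 16 KiB with 4 KiB pages) and free it again. */
static int example_alloc_buffer(void)
{
    unsigned long buf = __get_free_pages(GFP_KERNEL, 2);

    if (!buf)
        return -ENOMEM;

    /* ... use the physically contiguous buffer at linear address 'buf' ... */

    free_pages(buf, 2); /* must pass back the same order */
    return 0;
}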


struct page *alloc_pages()

static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order) /* alloc_pages() can allocate pages from any zone */
{
    return alloc_pages_current(gfp_mask, order);
}

struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
    struct mempolicy *pol = get_task_policy(current); /* current's mempolicy; if it is NULL, fall back to &preferred_node_policy[node] */
    struct page *page;
    unsigned int cpuset_mems_cookie;

    if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) /* no policy, interrupt context, or node-local allocation: use the default policy */
        pol = &default_policy;

retry_cpuset:
    cpuset_mems_cookie = get_mems_allowed();

    /*
     * No reference counting needed for current->mempolicy
     * nor system default_policy
     */
    if (pol->mode == MPOL_INTERLEAVE) /* interleave policy */
        page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
    else
        page = __alloc_pages_nodemask(gfp, order,
                policy_zonelist(gfp, pol, numa_node_id()),
                policy_nodemask(gfp, pol));

    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
        goto retry_cpuset;

    return page;
}


In the code above,

cpuset_mems_cookie = get_mems_allowed();

...
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))

        goto retry_cpuset;

appear as a pair. This is a sequence lock, and what it protects here is current->mems_allowed, because an allocation must respect the set of nodes this task is allowed to allocate on. If mems_allowed is modified while the allocation is in flight, then even if a page was found the task might no longer be allowed to use that node; so after the attempt the cookie is checked again, and if mems_allowed changed during the allocation and no page was obtained, the whole allocation is retried.
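The cookie pair is essentially the reader side of a seqcount. A minimal sketch of that read/retry pattern, using the generic seqlock API rather than the cpuset wrappers themselves (my_seq and the protected data are placeholders), looks like this:

#include <linux/seqlock.h>

static seqcount_t my_seq; /* initialised elsewhere with seqcount_init(&my_seq) */

static void reader(void)
{
    unsigned int seq;

    do {
        seq = read_seqcount_begin(&my_seq);  /* take the snapshot cookie */
        /* ... read the data protected by my_seq ... */
    } while (read_seqcount_retry(&my_seq, seq)); /* a writer ran: read again */
}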

Next, the allocation (memory) policy. Its main job here is to pick a node id, i.e. to end up with a pgdat (pg_data_t) structure to allocate from. The policy structure is shown below.

struct mempolicy {
    atomic_t refcnt;
    unsigned short mode;    /* See MPOL_* above */
    unsigned short flags;   /* See set_mempolicy() MPOL_F_* above */
    union {
        short        preferred_node; /* preferred */
        nodemask_t   nodes;          /* interleave/bind */
        /* undefined for default */
    } v;
    union {
        nodemask_t cpuset_mems_allowed; /* relative to these nodes */
        nodemask_t user_nodemask;       /* nodemask passed by user */
    } w;
};
The meaning of the members is already explained well enough by the comments.

There are several policy modes:
enum {
    MPOL_DEFAULT,
    MPOL_PREFERRED,
    MPOL_BIND,
    MPOL_INTERLEAVE,
    MPOL_LOCAL,
    MPOL_MAX,   /* always last member of enum */
};
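These are the same MPOL_* modes that userspace can request with the set_mempolicy() system call. A hedged userspace sketch (the node number is purely illustrative; build against libnuma's numaif.h):

#include <numaif.h> /* set_mempolicy(), MPOL_*; provided by libnuma */
#include <stdio.h>

int main(void)
{
    unsigned long nodemask = 1UL << 0; /* bit 0 set: allow node 0 only (illustrative) */

    /* After this, alloc_pages_current() for this task sees pol->mode == MPOL_BIND. */
    if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) != 0) {
        perror("set_mempolicy");
        return 1;
    }
    return 0;
}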


The interleave policy:

Continuing with the code above, when pol->mode == MPOL_INTERLEAVE:

page = alloc_page_interleave(gfp, order, interleave_nodes(pol));

static unsigned interleave_nodes(struct mempolicy *policy)
{
    unsigned nid, next;
    struct task_struct *me = current;

    nid = me->il_next;
    next = next_node(nid, policy->v.nodes); /* policy->v.nodes is a bitmap of allowed nodes; find the next set bit after nid */
    if (next >= MAX_NUMNODES)               /* past the highest node id: wrap around to the first allowed node */
        next = first_node(policy->v.nodes);
    if (next < MAX_NUMNODES)
        me->il_next = next;
    return nid;
}

static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                    unsigned nid)
{
    struct zonelist *zl;
    struct page *page;

    zl = node_zonelist(nid, gfp); /* NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); with __GFP_THISNODE this is the node-local zonelists[1], otherwise the fallback zonelists[0] */
    page = __alloc_pages(gfp, order, zl); /* this is what we analyse below */
    if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
        inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
    return page;
}


alloc_page_interleave():
    First, interleave_nodes() picks the node id (nid) from current;

    Then nid and gfp select the zonelist: zonelist[0] holds the zones of all nodes ordered by their distance from the current node, while zonelist[1] holds only the current node's own zones (see the zonelist initialisation code);
    Finally it calls __alloc_pages(), which boils down to __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
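For reference, node_zonelist() itself is tiny; reproduced here roughly from this kernel generation's include/linux/gfp.h, so treat it as a sketch:

static inline int gfp_zonelist(gfp_t flags)
{
    if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
        return 1;   /* the node-local zonelist, node_zonelists[1] */
    return 0;       /* the ordered fallback zonelist, node_zonelists[0] */
}

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
    return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}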

The bind and preferred policies:

page = __alloc_pages_nodemask(gfp, order,
        policy_zonelist(gfp, pol, numa_node_id()),
        policy_nodemask(gfp, pol));

static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
    int nd)
{
    switch (policy->mode) {
    case MPOL_PREFERRED: /* preferred mode */
        if (!(policy->flags & MPOL_F_LOCAL))
            nd = policy->v.preferred_node; /* in this mode the v union holds a single node; in the other modes it holds a nodemask */
        break;
    case MPOL_BIND:
        /*
         * Normally, MPOL_BIND allocations are node-local within the
         * allowed nodemask.  However, if __GFP_THISNODE is set and the
         * current node isn't part of the mask, we use the zonelist for
         * the first node in the mask instead.
         */
        if (unlikely(gfp & __GFP_THISNODE) &&
                unlikely(!node_isset(nd, policy->v.nodes))) /* node-local allocation requested, but nd is not in the allowed mask */
            nd = first_node(policy->v.nodes); /* take the first allowed node from the mask */
        break;
    default:
        BUG();
    }
    return node_zonelist(nd, gfp); /* as before, pick this node's zonelists[] */
}

static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
    /* Lower zones don't get a nodemask applied for MPOL_BIND */
    if (unlikely(policy->mode == MPOL_BIND) &&
            apply_policy_zone(policy, gfp_zone(gfp)) &&
            cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) /* does it intersect current->mems_allowed? */
        /*
         * From apply_policy_zone(): if policy->v.nodes has movable memory
         * only, we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
         * policy->v.nodes is intersected with node_states[N_MEMORY], so if
         * that test fails it implies policy->v.nodes has movable memory only.
         */
        return &policy->v.nodes;

    return NULL;
}


policy_zonelist(gfp, pol, numa_node_id()):
    First, in preferred mode policy->v.preferred_node names a single node rather than a node mask;
    then, in bind mode, if a node-local (__GFP_THISNODE) allocation was requested and the current node is not in the allowed mask, the first allowed node from the policy is used;
    in every other case the node returned by numa_node_id() is used.

struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask)

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
{
    enum zone_type high_zoneidx = gfp_zone(gfp_mask); /* highest zone allowed by gfp_mask; fallback order is typically highmem -> normal -> dma */
    struct zone *preferred_zone;
    struct page *page = NULL;
    int migratetype = allocflags_to_migratetype(gfp_mask); /* migrate type, i.e. the index into zone->free_area[].free_list[] */
    unsigned int cpuset_mems_cookie;
    int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; /* allocation flags: use the low watermark and respect the cpuset's allowed nodes */
    struct mem_cgroup *memcg = NULL;

    gfp_mask &= gfp_allowed_mask; /* mask off flags that are not currently allowed */

    lockdep_trace_alloc(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_WAIT);

    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;

    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */
    if (unlikely(!zonelist->_zonerefs->zone)) /* the fallback zonelist contains no zones at all */
        return NULL;

    /*
     * Will only have any effect when __GFP_KMEMCG is set.  This is
     * verified in the (always inline) callee
     */
    if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
        return NULL;

retry_cpuset:
    cpuset_mems_cookie = get_mems_allowed();

    /* The preferred zone is used for statistics later; walk the zonelist and pick the most suitable zone */
    first_zones_zonelist(zonelist, high_zoneidx,
                nodemask ? : &cpuset_current_mems_allowed,
                &preferred_zone);
    if (!preferred_zone)
        goto out;

#ifdef CONFIG_CMA
    if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;
#endif
    /* First allocation attempt (the fast path) */
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
            zonelist, high_zoneidx, alloc_flags,
            preferred_zone, migratetype);
    if (unlikely(!page)) {
        /*
         * Runtime PM, block IO and its error handling path
         * can deadlock because I/O on the device might not
         * complete.
         */
        gfp_mask = memalloc_noio_flags(gfp_mask);
        page = __alloc_pages_slowpath(gfp_mask, order, /* the slow path */
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);
    }

    trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
    /*
     * When updating a task's mems_allowed, it is possible to race with
     * parallel threads in such a way that an allocation can fail while
     * the mask is being updated. If a page allocation is about to fail,
     * check if the cpuset changed during allocation and if so, retry.
     */
    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
        goto retry_cpuset;

    memcg_kmem_commit_charge(page, memcg, order);

    return page;
}
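A few illustrative gfp_zone() results, clarifying how high_zoneidx above is derived (this assumes a 32-bit x86 configuration with highmem; the real function is a compile-time bit-table lookup and the exact mapping depends on the kernel config):

#include <linux/gfp.h>

/* Illustrative sketch only; the commented values are what a typical
 * highmem-enabled config would yield. */
static void gfp_zone_examples(void)
{
    enum zone_type a = gfp_zone(GFP_KERNEL);           /* ZONE_NORMAL  */
    enum zone_type b = gfp_zone(GFP_HIGHUSER_MOVABLE); /* ZONE_MOVABLE */
    enum zone_type c = gfp_zone(GFP_DMA);              /* ZONE_DMA     */

    (void)a; (void)b; (void)c; /* silence unused-variable warnings */
}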


The following relates to anti-fragmentation. It changes the old zone->free_area.free_list layout: free_list used to link page blocks directly, whereas it is now an array of per-migrate-type lists (sketched right below), and the GFP flags are converted to a migrate type before the allocation.
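The per-order free lists in each zone now look roughly like this (abridged from this kernel generation's include/linux/mmzone.h):

struct free_area {
    struct list_head    free_list[MIGRATE_TYPES]; /* one list of free blocks per migrate type */
    unsigned long       nr_free;                  /* number of free blocks of this order */
};

struct zone {
    /* ... */
    struct free_area    free_area[MAX_ORDER];     /* one entry per block order 0..MAX_ORDER-1 */
    /* ... */
};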

int migratetype = allocflags_to_migratetype(gfp_mask);

/* Convert GFP flags to their corresponding migrate type */
static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
    WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); /* GFP_MOVABLE_MASK covers both the reclaimable and the movable bit */

    if (unlikely(page_group_by_mobility_disabled)) /* anti-fragmentation (grouping pages by mobility) is disabled, e.g. on low-memory systems */
        return MIGRATE_UNMOVABLE; /* which is 0 */

    /* Group based on mobility */
    return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
        ((gfp_flags & __GFP_RECLAIMABLE) != 0); /* encode the movable/reclaimable bits into the migrate type used at allocation time */
}
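For reference, the first migrate types in this kernel generation are numbered like this (abridged reproduction), so the bit arithmetic above maps the GFP flags straight onto them: no flag set gives MIGRATE_UNMOVABLE (0), __GFP_RECLAIMABLE gives MIGRATE_RECLAIMABLE (1), and __GFP_MOVABLE gives MIGRATE_MOVABLE (2):

enum {
    MIGRATE_UNMOVABLE,   /* 0: e.g. a plain GFP_KERNEL allocation              */
    MIGRATE_RECLAIMABLE, /* 1: __GFP_RECLAIMABLE set, e.g. dentry/inode caches */
    MIGRATE_MOVABLE,     /* 2: __GFP_MOVABLE set, e.g. GFP_HIGHUSER_MOVABLE    */
    /* ... MIGRATE_RESERVE, MIGRATE_CMA, MIGRATE_ISOLATE, MIGRATE_TYPES follow ... */
};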


static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                    enum zone_type highest_zoneidx,
                    nodemask_t *nodes,
                    struct zone **zone)
{
    return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
                zone);
}

/* Returns the next zone at or below highest_zoneidx in a zonelist */
/* Since highest_zoneidx is the highest zone allowed, any zone at or below it qualifies as the preferred zone */
struct zoneref *next_zones_zonelist(struct zoneref *z, /* z points into the zonelist's array of zonerefs */
                    enum zone_type highest_zoneidx,
                    nodemask_t *nodes,
                    struct zone **zone)
{
    /*
     * Find the next suitable zone to use for the allocation.
     * Only filter based on nodemask if it's set
     */
    if (likely(nodes == NULL)) /* no nodemask (e.g. the interleave path): every node's zones are acceptable */
        while (zonelist_zone_idx(z) > highest_zoneidx) /* skip zones above the allowed limit */
            z++;
    else
        while (zonelist_zone_idx(z) > highest_zoneidx ||
                (z->zone && !zref_in_nodemask(z, nodes)))
            z++;

    *zone = zonelist_zone(z);
    return z;
}

In effect, this returns the preferred zone of the zonelist.
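The zoneref helpers used above are plain accessors. Roughly, from this kernel generation's include/linux/mmzone.h (treat as a sketch):

struct zoneref {
    struct zone *zone;  /* pointer to the actual zone              */
    int zone_idx;       /* cached zone_idx(zone), e.g. ZONE_NORMAL */
};

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
    return zoneref->zone;
}

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
    return zoneref->zone_idx;
}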