
Process Scheduling --- The CFS Scheduler

2016-05-04 16:55
How the Completely Fair Scheduler (CFS) makes its choice:
CFS schedules according to how long each process has waited on the ready queue: the longer a process has waited, the better its chance of being picked next. Rather than mapping nice values directly to timeslices, CFS computes how long a process should run from the total number of runnable processes. The nice value instead acts as a weight on the proportion of processor time a process receives: a higher nice value (lower priority) gives a smaller weight, and thus a smaller share of the processor relative to a process with the default nice value; conversely, a lower nice value (higher priority) gives a larger weight and a larger share.
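To make the weight idea concrete, here is a minimal userspace sketch (not kernel code) that computes each runnable task's share of the CPU from its weight; the three weight values are taken from the kernel's prio_to_weight[] table for nice -5, 0 and +5.

#include <stdio.h>

/* Weights from the kernel's prio_to_weight[] table. */
static const int weights[]   = { 3121, 1024, 335 };	/* nice -5, 0, +5 */
static const int nice_vals[] = { -5, 0, 5 };

int main(void)
{
	int i, total = 0;

	for (i = 0; i < 3; i++)
		total += weights[i];

	/* With all three tasks runnable, each receives weight/total of the CPU. */
	for (i = 0; i < 3; i++)
		printf("nice %+d: %4.1f%%\n", nice_vals[i],
		       100.0 * weights[i] / total);
	return 0;
}

With these three tasks runnable at once, the shares come out to roughly 69.7%, 22.9% and 7.5%.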
At initialization time:

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	/* the periodic timer tick is what drives rebalancing */
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */
}

During boot, start_kernel() calls sched_init(), which initializes the per-CPU runqueues by calling init_cfs_rq(&rq->cfs) and init_rt_rq(&rq->rt, rq).

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;	/* total load of the queued entities */
	unsigned int nr_running, h_nr_running;

	u64 exec_clock;
	u64 min_vruntime;		/* smallest recorded virtual runtime */
#ifndef CONFIG_64BIT
	u64 min_vruntime_copy;
#endif

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr, *next, *last, *skip;

#ifdef CONFIG_SCHED_DEBUG
	unsigned int nr_spread_over;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	int on_list;
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_SMP
	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;

	/*
	 * Maintaining per-cpu shares distribution for group scheduling
	 *
	 * load_stamp is the last time we updated the load average
	 * load_last is the last time we updated the load average and saw load
	 * load_unacc_exec_time is currently unaccounted execution time
	 */
	u64 load_avg;
	u64 load_period;
	u64 load_stamp, load_last, load_unacc_exec_time;

	unsigned long load_contribution;
#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
	int runtime_enabled;
	u64 runtime_expires;
	s64 runtime_remaining;

	u64 throttled_timestamp;
	int throttled, throttle_count;
	struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

The scheduling entity:

struct sched_entity {
	struct load_weight	load;		/* for load-balancing */
	struct rb_node		run_node;
	struct list_head	group_node;
	unsigned int		on_rq;		/* is this entity on a runqueue? */

	u64			exec_start;	/* timestamp from the cfs_rq clock, used to compute runtime deltas */
	u64			sum_exec_runtime;	/* total real run time of the process */
	u64			vruntime;	/* virtual runtime, the key the scheduler selects by */
	u64			prev_sum_exec_runtime;	/* sum_exec_runtime saved when the entity was last put on the CPU */

	u64			nr_migrations;

#ifdef CONFIG_SCHEDSTATS
	struct sched_statistics statistics;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct sched_entity	*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq		*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq		*my_q;
#endif
};

For CFS, note that the scheduling entity is embedded in the process descriptor task_struct as its se member.
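Because the entity is embedded, the scheduler can recover the owning task with container_of(); the helper in fair.c of this era is essentially:

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}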
The concrete scheduling class for CFS is fair_sched_class:

/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,	/* the next-lower class is the idle class */
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task		= set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group	= task_move_group_fair,
#endif
};
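For context on how this class gets invoked: the core scheduler walks the class list through these .next pointers. The pick_next_task() helper in core.c of roughly the same kernel era looks like the sketch below (quoted from memory, so details may differ between versions); note the fast path for the common case where every runnable task is in the fair class:

static inline struct task_struct *pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Optimization: if all runnable tasks are in the fair class,
	 * call its pick_next_task() directly.
	 */
	if (likely(rq->nr_running == rq->cfs.nr_running)) {
		p = fair_sched_class.pick_next_task(rq);
		if (likely(p))
			return p;
	}

	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	BUG(); /* the idle class always has a runnable task */
}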
As shown in the previous chapter, during a task switch deactivate_task() ends up calling p->sched_class->dequeue_task(rq, p, flags); for a CFS task this dispatches to dequeue_task_fair() in fair_sched_class:

static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;	/* the class methods operate on the task's embedded sched_entity */
	int task_sleep = flags & DEQUEUE_SLEEP;

	for_each_sched_entity(se) {		/* with group scheduling, walk up the entity hierarchy */
		cfs_rq = cfs_rq_of(se);		/* the runqueue this entity is on */
		dequeue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running decrement below.
		 */
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running--;

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && parent_entity(se))
				set_next_buddy(parent_entity(se));

			/* avoid re-evaluating load for this entity */
			se = parent_entity(se);
			break;
		}
		flags |= DEQUEUE_SLEEP;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running--;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_cfs_load(cfs_rq, 0);
		update_cfs_shares(cfs_rq);
	}

	if (!se)
		dec_nr_running(rq);
	hrtick_update(rq);
}
Removing the entity from the queue is done by dequeue_entity():

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	update_stats_dequeue(cfs_rq, se);
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	update_cfs_load(cfs_rq, 0);
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_min_vruntime(cfs_rq);
	update_cfs_shares(cfs_rq);
}
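Note the normalization near the end: when the entity leaves the queue for a reason other than sleeping (e.g. migration), its vruntime is stored relative to this queue's min_vruntime, and enqueue_entity() later adds the new queue's min_vruntime back. A tiny standalone illustration of that invariant (the numbers are made up):

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	/* Hypothetical values, illustration only. */
	u64 min_vruntime_src = 1000000;	/* base of the source cfs_rq */
	u64 min_vruntime_dst = 5000000;	/* base of the target cfs_rq */
	u64 vruntime = 1002000;		/* task's absolute vruntime on the source CPU */

	vruntime -= min_vruntime_src;	/* dequeue: keep a relative value */
	printf("relative while off the queue: %llu\n", vruntime);

	vruntime += min_vruntime_dst;	/* enqueue: rebase onto the new queue */
	printf("absolute on the new queue:    %llu\n", vruntime);
	return 0;
}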


/* Runtime accounting; invoked periodically from the system timer tick. */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock;	/* current runqueue clock */
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	/* i.e. compute how long the current entity has executed */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)	/* nothing ran since the last update */
		return;
	/* weight the elapsed runtime and fold it into vruntime */
	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;	/* restart the measurement from now */

	if (entity_is_task(curr)) {	/* for a real task, also charge group/cgroup accounting */
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}
}
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
	/* update the total real run time */
	curr->sum_exec_runtime += delta_exec;
	/* update the cfs_rq's exec_clock */
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	/*
	 * calc_delta_fair() converts real time into virtual time using the
	 * entity's weight, i.e. its priority: the higher the priority, the
	 * larger the weight, and the slower its virtual clock advances.
	 */
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
	/*
	 * vruntime measures how long the entity has effectively run, and
	 * thereby decides who should run next.
	 */
	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}
When a process is created, its priority and weight are derived from its parent's, in sched_fork(), which is called from copy_process().
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	/*
	 * NICE_0_LOAD is the weight of nice level 0. A nice-0 entity's
	 * virtual clock runs at the same speed as the real clock, so delta
	 * is returned unchanged; for any other weight, calc_delta_mine()
	 * rescales delta by the entity's weight.
	 */
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}


Without going through calc_delta_mine(delta_exec, weight, lw) line by line: it computes approximately delta * weight / lw->weight. So if the entity has the nice-0 weight, calc_delta_fair() simply returns delta; otherwise calc_delta_mine() rescales it, giving delta = delta * NICE_0_LOAD / se->load.weight. Where does se->load come from? Following sys_nice(), you find that the load is looked up from the task's nice value, and the lower the nice value, the larger the weight. The conclusion: for the same amount of real runtime (the same delta), a high-priority process accumulates a smaller vruntime delta than a low-priority one, ends up further to the left in the red-black tree, and is therefore scheduled sooner. Put differently, the larger the weight, the slower the virtual clock ticks, and the relation is linear.
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (!lw->inv_weight) {
		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
			lw->inv_weight = 1;
		else
			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
						/ (lw->weight+1);
	}

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
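To check the arithmetic with concrete numbers, here is a userspace sketch (not kernel code) of the idealized formula delta * NICE_0_LOAD / weight, ignoring the inv_weight fixed-point trick the kernel uses to avoid the division; the weights again come from prio_to_weight[]:

#include <stdio.h>

#define NICE_0_LOAD 1024

/* Idealized calc_delta_fair(): scale real time by NICE_0_LOAD/weight. */
static unsigned long vruntime_delta(unsigned long delta_exec,
				    unsigned long weight)
{
	return delta_exec * NICE_0_LOAD / weight;
}

int main(void)
{
	unsigned long delta = 10000;	/* 10 ms of real time, in microseconds */

	printf("nice -5 (weight 3121): +%lu us vruntime\n",
	       vruntime_delta(delta, 3121));
	printf("nice  0 (weight 1024): +%lu us vruntime\n",
	       vruntime_delta(delta, 1024));
	printf("nice +5 (weight  335): +%lu us vruntime\n",
	       vruntime_delta(delta, 335));
	return 0;
}

For the same 10 ms of real time, the nice -5 task's virtual clock advances only about a third as fast as the nice-0 task's, while the nice +5 task's advances roughly three times as fast.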

min_vruntime is a field of the CFS runqueue and serves as the queue's time base; it plays a crucial role in CFS, and this part of the code has been revised frequently since CFS was introduced.
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The currently running entity is not kept in the red-black tree, so
	 * updating cfs_rq->min_vruntime must take it into account to avoid
	 * unfairness (something earlier schedulers overlooked). If there is
	 * a current entity, start from its vruntime.
	 */
	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			/*
			 * No current entity (e.g. a task from another
			 * scheduling class is running): the leftmost
			 * entity's vruntime is simply the minimum.
			 */
			vruntime = se->vruntime;
		else
			/*
			 * Otherwise take the smaller of the two.
			 */
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* Finally update cfs_rq->min_vruntime; the value only ever increases. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}
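The min_vruntime()/max_vruntime() helpers used above compare through a signed subtraction, so they stay correct even if the u64 counters wrap around; from the same file:

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}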


static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* take the entity's weight out of cfs_rq->load */
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se)) {	/* a real task: update the task-weight accounting (group scheduling) */
		add_cfs_task_weight(cfs_rq, -se->load.weight);
		list_del_init(&se->group_node);
	}
	/* one fewer runnable entity */
	cfs_rq->nr_running--;
	se->on_rq = 0;	/* no longer on the runqueue */
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		/* the new leftmost is this node's in-order successor */
		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	/* remove the entity from the red-black tree */
	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}


Adding a task to the runqueue ultimately ends up in enqueue_task_fair():

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	/* with group scheduling, this walks the entity and its parents */
	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running increment below.
		 */
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running++;

		flags = ENQUEUE_WAKEUP;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running++;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_cfs_load(cfs_rq, 0);
		update_cfs_shares(cfs_rq);
	}

	if (!se)
		inc_nr_running(rq);
	hrtick_update(rq);
}
After updating the relevant scheduling statistics, enqueue_entity() eventually calls __enqueue_entity() to insert the entity into the red-black tree of runnable processes:

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	update_cfs_load(cfs_rq, 0);
	account_entity_enqueue(cfs_rq, se);
	update_cfs_shares(cfs_rq);

	if (flags & ENQUEUE_WAKEUP) {
		place_entity(cfs_rq, se, 0);	/* wakeup handling (see try_to_wake_up()) */
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;

	if (cfs_rq->nr_running == 1) {
		list_add_leaf_cfs_rq(cfs_rq);
		check_enqueue_throttle(cfs_rq);
	}
}
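For reference, place_entity() is what gives a waking (or newly forked) task its initial vruntime. The version below is from roughly the same kernel era (details such as the sched_feat flags vary between versions): it starts from min_vruntime, debits new tasks one slice so they cannot immediately preempt everyone, grants sleepers a bounded credit, and never lets an entity gain time by being placed backwards.

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/* a newly forked task starts one slice behind */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/* halve the sleep credit for a gentler effect */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards */
	se->vruntime = max_vruntime(se->vruntime, vruntime);
}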

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {	/* the key is the entity's vruntime */
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
So the CFS run queue is laid out as a red-black tree, ordered by the entities' vruntime.
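The ordering predicate entity_before() used during insertion is just a signed comparison of the two vruntimes, which keeps working even across u64 wraparound:

static inline int
entity_before(struct sched_entity *a, struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}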

The core of the CFS algorithm is to pick the task with the smallest vruntime. Because the run queue is a red-black tree keyed by the runnable processes' virtual runtimes, the next process to run, the one with the smallest vruntime of all, is simply the leftmost leaf of the tree.

The selection is implemented by pick_next_task_fair():

static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (!cfs_rq->nr_running)
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);	/* make it the currently running entity */
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	if (hrtick_enabled(rq))
		hrtick_start_fair(rq, p);

	return p;
}
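The do/while loop descends the group-scheduling hierarchy: in the group-scheduling configuration, group_cfs_rq() returns the runqueue owned by a group entity, and NULL for a plain task, which is what terminates the loop:

/* runqueue "owned" by this entity/group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}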

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_first_entity(cfs_rq);
	struct sched_entity *left = se;

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip == se) {
		struct sched_entity *second = __pick_next_entity(se);
		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	/*
	 * Prefer last buddy, try to return the CPU to a preempted task.
	 */
	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;

	/*
	 * Someone really wants this to run. If it's not unfair, run it.
	 */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;

	clear_buddies(cfs_rq, se);

	return se;
}
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}
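And __pick_first_entity(), where pick_next_entity() starts, simply returns the cached leftmost node, so selecting the next task is O(1):

static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}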