您的位置:首页 > 运维架构 > Linux

linux kernel scheduler -- 进程优先级

2015-07-17 15:03 836 查看

1. task_struct 中标示linux 进程优先级的几个重要变量

linux kernel 进程描述符task_struct中有几个成员标识 linux 进程的优先级,prio, static_prio, normal_prio,rt_priority。

它们之间究竟是什么关系,究竟什么样的优先级值才能最快被调度器调度执行?

详细解读之前,可以先明确上面的一个问题,task_struct 中的成员变量 prio越小,进程的优先级越高。prio 值的取值范围为0..139。
这一点在include/linux/sched.h 的注释中也有说明,

/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space.  This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/

#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO

#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)


这里需要先提一下调度器的几种调度策略:SCHED_FIFO/SCHED_RR 为实时进程调度策略,SCHED_NORMAL/SCHED_BATCH 为非实时进程也就是普通进程的调度策略。其余两种中,SCHED_ISO 只是预留、尚未实现,SCHED_IDLE 本文不讨论,都可以先不用管。

/*
* Scheduling policies
*/
#define SCHED_NORMAL		0
#define SCHED_FIFO		1
#define SCHED_RR		2
#define SCHED_BATCH		3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE		5
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK     0x40000000


其中,实时进程(sched policy 为 SCHED_FIFO/SCHED_RR)的优先级(p->prio)取值范围是0..99,非实时进程的取值范围为100..139。

如果问题就到这里结束也许觉得也还挺清晰,但是这只是开始,问题来源于linux kernel 中设置调度器的方式似乎正在颠覆你刚刚有的这点认识。

接下来会一一描述,但是有一条请绝对坚信并牢记,
task_struct 中的成员变量 prio越小,进程的优先级越高。prio 值的取值范围为0..139。

2.设置进程的调度策略和进程优先级

下面以linux kernel watchdog 进程作为例子,kernel/watchdog.c

为每个cpu 创建 watchdog/N 进程,watchdog 进程的循环体在 watchdog()函数。

static int watchdog_enable(int cpu)
{
struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
int err = 0;

/* enable the perf event */
err = watchdog_nmi_enable(cpu);

/* Regardless of err above, fall through and start softlockup */

/* create the watchdog thread */
if (!p) {
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
设置 watchdog/N 进程的调度策略和进程优先级,sched policy 很明确SCHED_FIFO,为RT 实时进程。
.sched_priority 被设置成了MAX_RT_PRIO-1 也就是99了,这是个什么优先级的进程呢? 别急,且看代码且分析。
//struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
sched_setscheduler(p, SCHED_FIFO, &param);


sched_setscheduler()

-> __sched_setscheduler()

-> __setscheduler()

在__sched_setscheduler() 函数中有这么一段代码,向我们描述了一个很重要的事实,想用sched_setscheduler()设置进程优先级,

必须满足: 实时进程有效优先级为1..99,非实时进程的优先级为0。 这里你可能觉得我是在开玩笑,非实时进程的有效优先级怎么可能为0 ?

请注意,这里指的是内核通过sched_setscheduler() 接口设置的优先级,在后面你就会看到非实时进程的优先级我们有个初始值(fork/init),

你只能通过内核提供的nice 设置函数,间接修改非实时进程的优先级。

/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
if (param->sched_priority < 0 ||
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
if (rt_policy(policy) != (param->sched_priority != 0))
return -EINVAL;


后来调到__setscheduler() 真正进行进程优先级设置,

/* Actually do priority change: must hold rq lock. */
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
p->policy = policy;// 你通过sched_setscheduler()传入的sched policy就是最终赋予给(task_struct*)p->policy
p->rt_priority = prio;//你通过sched_setscheduler()传入的prio 最终却赋予给了(task_struct*)p->rt_priority,怎么样? 没忽悠你吧,传入的非实时进程优先级值真的只能是0!!!
p->normal_prio = normal_prio(p);//接下来看 p->normal_prio 是怎么回事?
/* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);
if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
set_load_weight(p);
}


看下normal_prio()

/*
* Calculate the expected normal priority: i.e. priority
* without taking RT-inheritance into account. Might be
* boosted by interactivity modifiers. Changes upon fork,
* setprio syscalls, and whenever the interactivity
* estimator recalculates.
*/
static inline int normal_prio(struct task_struct *p)
{
int prio;

if (task_has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;//如果你是实时进程,p->prio = 99 - 你通过sched_setscheduler()传入的优先级值,这下明白了吧,watchdog 进程通过sched_setscheduler() 设置的优先级值为99, 这样一算之后真正 p->prio 为0, 也就是linux kernel 中最高优先级的实时进程!
else
prio = __normal_prio(p);// 非实时进程的优先级,可真费劲,接着看?
return prio;
}


看下__normal_prio(),

/*
* __normal_prio - return the priority that is based on the static prio
*/
static inline int __normal_prio(struct task_struct *p)
{
return p->static_prio;//逗我呢吧,非实时进程通过sched_setscheduler()设置的优先级值并没有赋予p->prio,而是直接取的p->static_prio 的值
//p->static_prio 的值从哪来的呢?
}


遍历整个kernel 代码树,只有三个地方会对p->static_prio进行赋值,

--------------------------

void sched_fork(struct task_struct *p)
{
/*
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
if (task_has_rt_policy(p)) {
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);

--------------------------

void set_user_nice(struct task_struct *p, long nice)
{

/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
* it wont have any effect on scheduling until the task is
* SCHED_FIFO/SCHED_RR:
*/
if (task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
on_rq = p->on_rq;
if (on_rq)
dequeue_task(rq, p, 0);

p->static_prio = NICE_TO_PRIO(nice);
--------------------------

/*
*  INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
*/
#define INIT_TASK(tsk)	\
{									\
.state		= 0,						\
.stack		= &init_thread_info,				\
.usage		= ATOMIC_INIT(2),				\
.flags		= PF_KTHREAD,					\
.prio		= MAX_PRIO-20,					\
.static_prio	= MAX_PRIO-20,


明白了吧? p->static_prio 的值,也就是非实时进程的优先级,除了 INIT_TASK 给出的初始值之外,只能通过fork 或是设置nice 值间接修改。

3.查看进程优先级

接下来,你可能最关心的问题就是如何查看一个进程的优先级呢?

ps 提供这个选项,Android ps 的实现不同于ubuntu,看下system/core/toolbox/ps.c,源代码告诉我们选项足够了,调度策略,优先级,RT优先级都能看,

ps -p -P

看下吧:PRIO 标识进程的优先级,怎么样? 有没有世界观又一次被颠覆的感觉? 你不是告诉我进程优先级取值范围是 0..139吗???

别急, 看下ps 到底是怎么得到进程的优先级的。

ps 是通过 cat /proc/<pid>/stat 获取到的值,

对应kernel 里面的实现在 fs/proc/base.c 中,

static const struct pid_entry tgid_base_stuff[] = {

INF("cmdline",    S_IRUGO, proc_pid_cmdline),
ONE("stat",       S_IRUGO, proc_tgid_stat),


看下priority 是通过task_prio()获取的,

static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{

/* scale priority and nice values from timeslices to -20..20 */
/* to make it look like a "normal" Unix priority/nice value  */
priority = task_prio(task);
nice = task_nice(task);


看下task_prio()

/**
* task_prio - return the priority value of a given task.
* @p: the task in question.
*
* This is the priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
int task_prio(const struct task_struct *p)
{
return p->prio - MAX_RT_PRIO;//ps 命令得到的priority 值是 p->prio - 100 得到的!!!
}


赶紧找到你熟悉的进程看看,有没有突然又相信本文开始提醒你坚持的世界观:

task_struct 中的成员变量 prio越小,进程的优先级越高。prio 值的取值范围为0..139。

USER     PID   PPID  VSIZE  RSS   PRIO  NICE  RTPRI SCHED  PCY  WCHAN    PC        NAME

root      1     0     932    716   20    0     0     0     fg  c0179270 00029eb4 S /init

root      2     0     0      0     -2    0     1     1     fg  c00ba720 00000000 S kthreadd

root      3     2     0      0     20    0     0     0     fg  c00a1e00 00000000 S ksoftirqd/0

root      6     2     0      0     20    0     0     0     fg  c0081080 00000000 D kworker/u:0

root      7     2     0      0     0     -20   0     0     fg  c007e2e0 00000000 D kworker/u:0H

root      8     2     0      0     -100  0     99    1     fg  c00f494c 00000000 S migration/0

root      21    2     0      0     0     -20   0     0     fg  c00b51b0 00000000 S khelper

root      22    2     0      0     0     -20   0     0     fg  c00b51b0 00000000 S netns

root      27    2     0      0     0     -20   0     0     fg  c00b60a8 00000000 S kworker/0:1H

root      28    2     0      0     0     -20   0     0     fg  c00b51b0 00000000 S modem_notifier

root      29    2     0      0     0     -20   0     0     fg  c00b51b0 00000000 S smd_channel_clo

root      30    2     0      0     0     -20   0     0     fg  c00b51b0 00000000 S smsm_cb_wq

root      32    2     0      0     0     -20   0     0     fg  c00b51b0 00000000 S rpm-smd

root      33    2     0      0     0     -20   0     0     fg  c00b60a8 00000000 S kworker/u:1H

root      50    2     0      0     -51   0     50    1     fg  c0100ac8 00000000 S irq/47-cpr

root      51    2     0      0     0     -20   0     0     fg  c00b51b0 00000000 S mpm

root      52    2     0      0     20    0     0     0     fg  c0144d74 00000000 S sync_supers

root      53    2     0      0     20    0     0     0     fg  c0145d9c 00000000 S bdi-default

root      54    2     0      0     0     -20   0     0     fg  c00b51b0 00000000 S kblockd

root      55    2     0      0     20    0     0     5     fg  c03599c8 00000000 S system

root      56    2     0      0     20    0     0     0     fg  c0414830 00000000 S khubd

root      57    2     0      0     -51   0     50    1     fg  c0100ac8 00000000 S irq/102-msm_iom
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: