您的位置:首页 > 其它

softlockup检测(watchdog)原理(用于检测系统调度是否正常)

2017-09-05 10:34 417 查看
softlockup(watchdog)用于检测系统调度是否正常,即是否出现软锁。发生softlockup时,内核无法进行调度,但仍能响应中断,对用户的表现可能为:能ping通,但无法登录系统,无法进行正常操作。
其基本原理为:为每个CPU启动一个内核线程(watchdog/x),此线程为优先级最高的实时线程;该线程每次得到调度时都会更新相应的计数(时间戳),并在开始运行时启动一个定时器。定时器每次到期时检查该时间戳,如果超过指定时间仍未更新,则说明这段时间内没有发生调度(因为此线程优先级最高),此时打印相应告警,或根据配置进入panic流程。
基本代码分析(2.6.32)
rest_init->kernel_init->lockup_detector_init->cpu_callback->watchdog_prepare_cpu(初始化watchdog定时器):

点击(此处)折叠或打开
/*
 * watchdog_prepare_cpu - set up the per-CPU watchdog hrtimer for @cpu.
 * @cpu: CPU whose watchdog_hrtimer is being prepared.
 *
 * Initializes the CPU's watchdog_hrtimer and installs watchdog_timer_fn as
 * its expiry handler.  The timer is only initialized here; it is not started
 * by this function.
 *
 * Always returns 0.
 */
static int watchdog_prepare_cpu(int cpu)
{
	struct hrtimer *timer = &per_cpu(watchdog_hrtimer, cpu);

	/* Warn if a watchdog task pointer is already recorded for this CPU. */
	WARN_ON(per_cpu(softlockup_watchdog, cpu));

	/* High-resolution timer on the monotonic clock, relative mode. */
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

	/* Handler invoked on every expiry of the watchdog timer. */
	timer->function = watchdog_timer_fn;

	return 0;
}

看门狗定时器处理函数:

点击(此处)折叠或打开
/*
 * watchdog_timer_fn - expiry handler of the per-CPU watchdog hrtimer.
 * @hrtimer: the expiring watchdog timer; it is forwarded by one sample period.
 *
 * On every expiry this bumps the interrupt count used by the hardlockup
 * detector, wakes this CPU's watchdog thread, re-arms the timer, and then
 * checks whether watchdog_touch_ts (refreshed by the watchdog thread when it
 * runs) has gone stale.  A stale timestamp is reported as a soft lockup, but
 * only once until the timestamp is seen fresh again.
 *
 * Always returns HRTIMER_RESTART.
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	/* Timestamp the watchdog kernel thread updates whenever it is scheduled. */
	unsigned long last_touch = __get_cpu_var(watchdog_touch_ts);
	struct pt_regs *irq_regs = get_irq_regs();
	int stuck_for;

	/*
	 * kick the hardlockup detector: counting this interrupt shows that
	 * interrupts are still being serviced on this CPU.
	 */
	watchdog_interrupt_count();

	/* kick the softlockup detector: wake the watchdog kernel thread */
	wake_up_process(__get_cpu_var(softlockup_watchdog));

	/* .. and repeat: push the timer forward by one sample period */
	hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));

	if (!last_touch) {
		/*
		 * A zero timestamp skips the lockup check: take a fresh
		 * timestamp and wait for the next expiry.
		 */
		if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
			/*
			 * If the time stamp was touched atomically
			 * make sure the scheduler tick is up to date.
			 */
			__get_cpu_var(softlockup_touch_sync) = false;
			sched_clock_tick();
		}
		__touch_watchdog();
		return HRTIMER_RESTART;
	}

	/*
	 * check for a softlockup
	 * This is done by making sure a high priority task is
	 * being scheduled.  The task touches the watchdog to
	 * indicate it is getting cpu time.  If it hasn't then
	 * this is a good indication some task is hogging the cpu
	 */
	stuck_for = is_softlockup(last_touch);

	if (unlikely(stuck_for)) {
		/* only warn once per lockup episode */
		if (__get_cpu_var(soft_watchdog_warn) != true) {
			/* Record diagnostics about the stuck CPU and task. */
			printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
				smp_processor_id(), stuck_for,
				current->comm, task_pid_nr(current));
			print_modules();
			print_irqtrace_events(current);

			if (irq_regs)
				show_regs(irq_regs);
			else
				dump_stack();

			/* Panic instead of continuing when softlockup_panic is set. */
			if (softlockup_panic)
				panic("softlockup: hung tasks");

			__get_cpu_var(soft_watchdog_warn) = true;
		}
	} else {
		/* Timestamp is fresh: allow the next lockup to be reported. */
		__get_cpu_var(soft_watchdog_warn) = false;
	}

	return HRTIMER_RESTART;
}

启动看门狗,即创建watchdog内核线程。

点击(此处)折叠或打开
/*
 * watchdog_enable - start lockup detection on @cpu.
 * @cpu: CPU to enable the watchdog on.
 *
 * Enables the NMI (perf event) side first, then, if no watchdog thread is
 * recorded for this CPU yet, creates the "watchdog/<cpu>" kernel thread,
 * binds it to @cpu, resets the CPU's touch timestamp and wakes the thread.
 *
 * Returns the result of watchdog_nmi_enable(); if that was 0 and thread
 * creation failed, returns the thread-creation error instead.
 */
static int watchdog_enable(int cpu)
{
	struct task_struct *task = per_cpu(softlockup_watchdog, cpu);
	/* enable the perf event */
	int err = watchdog_nmi_enable(cpu);

	/* Regardless of err above, fall through and start softlockup */

	/* A watchdog thread already exists for this CPU: nothing to create. */
	if (task)
		return err;

	/* create the watchdog thread */
	task = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
	if (IS_ERR(task)) {
		printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
		/* if hardlockup hasn't already set this */
		if (!err)
			err = PTR_ERR(task);
		return err;
	}

	kthread_bind(task, cpu);
	per_cpu(watchdog_touch_ts, cpu) = 0;
	per_cpu(softlockup_watchdog, cpu) = task;
	wake_up_process(task);

	return err;
}

watchdog内核线程执行主函数,主要是要更新计数(时间戳)

点击(此处)折叠或打开
/*
 * watchdog - main function of the per-CPU watchdog kernel thread.
 * @unused: thread argument; not used by this function.
 *
 * Switches the thread to SCHED_FIFO at priority MAX_RT_PRIO-1, takes an
 * initial timestamp, starts this CPU's watchdog hrtimer, and then loops:
 * each time the thread is woken and gets to run it refreshes the timestamp
 * and goes back to sleep.  The timer handler uses that timestamp to decide
 * whether the CPU is still scheduling.
 *
 * Returns 0 once kthread_should_stop() reports that the thread must exit.
 */
static int watchdog(void *unused)
{
	/* Run at the highest real-time priority. */
	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);

	/*
	 * Make this a real-time (SCHED_FIFO) thread.  The third argument is
	 * the address of 'param'; the published snippet had it corrupted
	 * into a pilcrow character by HTML entity decoding.
	 */
	sched_setscheduler(current, SCHED_FIFO, &param);

	/* initialize timestamp */
	__touch_watchdog();

	/* kick off the timer for the hardlockup detector */
	/* done here because hrtimer_start can only pin to smp_processor_id() */
	/* The same timer's handler also performs the softlockup check. */
	hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
		      HRTIMER_MODE_REL_PINNED);

	/* Mark ourselves sleeping before entering the loop. */
	set_current_state(TASK_INTERRUPTIBLE);

	/*
	 * Run briefly once per second to reset the softlockup timestamp.
	 * If this gets delayed for more than 60 seconds then the
	 * debug-printout triggers in watchdog_timer_fn().
	 */
	while (!kthread_should_stop()) {
		/* Refresh the timestamp: proof that this thread was scheduled. */
		__touch_watchdog();
		schedule();

		if (kthread_should_stop())
			break;

		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

判断是否发生软锁:is_softlockup

点击(此处)折叠或打开
/*
 * is_softlockup - check whether the watchdog timestamp has gone stale.
 * @touch_ts: timestamp last recorded by the watchdog thread.
 *
 * Compares the current timestamp of this CPU against @touch_ts plus
 * softlockup_thresh.
 *
 * Returns how long ago @touch_ts was taken (now - touch_ts) when the
 * threshold has been exceeded, or 0 when it has not.
 */
static int is_softlockup(unsigned long touch_ts)
{
	unsigned long now = get_timestamp(smp_processor_id());
	unsigned long deadline = touch_ts + softlockup_thresh;

	/* Still within the allowed window: no lockup. */
	if (!time_after(now, deadline))
		return 0;

	/* Warn about unreasonable delays: */
	return now - touch_ts;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  watchdog