您的位置:首页 > 其它

4.3 时钟虚拟化

2015-07-26 16:28 1681 查看
kvm支持的时钟有8254,local apic timer,kvmclock等,本节将分析8254和kvmclock
4.3.1 8254时钟虚拟化
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
a. 建立内核线程
pit->worker_task = kthread_run(kthread_worker_fn,&pit->worker,
"kvm-pit/%d", pid_nr);
b. 准备workqueue
init_kthread_work(&pit->expired,pit_do_work);
c. 初始化一个高精度定时器(hrtimer),这个定时器就作为我们虚拟时钟的时钟源
hrtimer_init(&pit_state->timer,CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
d. pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; // 时钟中断的ack模拟
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
e. kvm_pit_reset(pit);
f. kvm_iodevice_init(&pit->dev,&pit_dev_ops); //注册io虚拟化操作
ret = kvm_io_bus_register_dev(kvm,KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
KVM_PIT_MEM_LENGTH, &pit->dev);
}

当guest os通过寄存器操作启动时钟控制器时,调用路径为pit_load_count ==> create_pit_timer,其中会启动一个hrtimer来模拟时钟中断源:

hrtimer_cancel(&ps->timer);
flush_kthread_work(&ps->pit->expired);
ps->period = interval;
ps->is_periodic = is_period;
ps->timer.function = pit_timer_fn;
ps->kvm = ps->pit->kvm;
atomic_set(&ps->pending, 0);
ps->irq_ack = 1;
.......
hrtimer_start(&ps->timer,ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);

pit_timer_fn完成时钟中断的累加
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps =container_of(data, struct kvm_kpit_state, timer);
struct kvm_pit *pt =ps->kvm->arch.vpit;

//如果允许重新注入(reinject),或者当前没有待处理的时钟中断(pending为0),就累加pending并提交注入工作;否则不再累加,多出的时钟中断被合并掉
if (ps->reinject ||!atomic_read(&ps->pending)) {
atomic_inc(&ps->pending);
queue_kthread_work(&pt->worker,&pt->expired);
}

if (ps->is_periodic) {
// 如果定时器周期触发,则再次启动定时器,否则销毁
hrtimer_add_expires_ns(&ps->timer,ps->period);
return HRTIMER_RESTART;
} else
return HRTIMER_NORESTART;
}

当定时器将时钟中断pending增加,并且添加完工作队列以后,接着就触发下面的时钟中断注入,如果上一个中断被接收,接着触发下一个。代码如下:
static void pit_do_work(struct kthread_work *work)
{
.......
spin_lock(&ps->inject_lock);
if (ps->irq_ack){
ps->irq_ack= 0;
inject = 1;
}
spin_unlock(&ps->inject_lock);
if (inject) {
///*模拟一个高电平和一个低电平,发送给PIC,触发时钟中断。*/
kvm_set_irq(kvm,kvm->arch.vpit->irq_source_id, 0, 1, false);
kvm_set_irq(kvm,kvm->arch.vpit->irq_source_id, 0, 0, false);

//nmi watchdog support

if(kvm->arch.vapics_in_nmi_mode > 0)
kvm_for_each_vcpu(i,vcpu, kvm)
kvm_apic_nmi_wd_deliver(vcpu);
}
}

kvm_pit_ack_irq实现中断的ack应答虚拟化
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps =container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
int value;

spin_lock(&ps->inject_lock);
value =atomic_dec_return(&ps->pending); //注入成功,则中断累加器减一
if (value < 0) // 异常情况,pending本来就是0还减少,说明是无效的ack
atomic_inc(&ps->pending);
else if (value > 0) // > 0说明还有积累的中断,需要重新注入
queue_kthread_work(&ps->pit->worker,&ps->pit->expired);
ps->irq_ack = 1;//设置ack位
spin_unlock(&ps->inject_lock);
}

4.3.2 kvmclock时钟虚拟化
时间虚拟化的一种实现方式是对时钟中断计数,进而换算得到时间。这种方式在虚拟机里存在问题:运行vcpu的cpu有时会被调度去执行别的任务,使时钟中断不能准时到达guest os。另外一种方式是模拟HPET之类的硬件时钟,guest os在需要的时候去读当前的时间,但这种方式会使虚拟机频繁地VM-exit,影响性能。
为此kvm引入了基于半虚拟化的时钟kvmclock,这种方式需要在guest上实现一个kvmclock驱动,建立guest os到VMM的通道,guest os通过这个通道向vmm查询时间。

(1) guest os kvmclock 驱动
源码路径: arch/x86/kernel/kvmclock.c 和 arch/x86/kernel/pvclock.c。
kvmclock_init ==>
a. kvm_register_clock
src =&hv_clock[cpu].pvti;
low =(int)slow_virt_to_phys(src) | 1;
high =((u64)slow_virt_to_phys(src) >> 32);
ret =native_write_msr_safe(msr_kvm_system_time, low, high);
通过写msr寄存器的方式将hv_clock[cpu].pvti的gpa通知给vmm(最低位置1表示启用)。
b. 改写x86的函数指针
pv_time_ops.sched_clock = kvm_clock_read;
x86_platform.calibrate_tsc =kvm_get_tsc_khz;
x86_platform.get_wallclock =kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
例如x86_platform.get_wallclock 默认为mach_get_cmos_time(从cmos取得wallclock).
wallclock(墙上时间)指的是现实世界的绝对时间,即系统开机时刻的时间加上开机以来运行的时间。
c. clocksource_register_hz(&kvm_clock,NSEC_PER_SEC); //注册系统时钟源
static struct clocksource kvm_clock = {
.name ="kvm-clock",
.read = kvm_clock_get_cycles,
.rating = 400, //rating400为理想时钟源
.mask =CLOCKSOURCE_MASK(64),
.flags =CLOCK_SOURCE_IS_CONTINUOUS,
};
由于kvm-clock将rating 设为400,这样会使clocksource_register_hz==》__clocksource_register_scale==》 clocksource_select==》__clocksource_select将guest os 的curr_clocksource设为kvmclock

下面重点分析kvm_clock_read和kvm_get_wallclock
static void kvm_get_wallclock(struct timespec *now)
{
struct pvclock_vcpu_time_info *vcpu_time;
int low, high;
int cpu;
low =(int)__pa_symbol(&wall_clock);
high = ((u64)__pa_symbol(&wall_clock)>> 32);
native_write_msr(msr_kvm_wall_clock,low, high);
preempt_disable();
cpu =smp_processor_id();
vcpu_time =&hv_clock[cpu].pvti;
pvclock_read_wallclock(&wall_clock,vcpu_time, now);
preempt_enable();
}
a. native_write_msr(msr_kvm_wall_clock, low, high);通知vmm要取wall_clock并将wall_clock的gpa告诉vmm.

b. pvclock_read_wallclock 返回vmm设置好的wallclock。wall_clock在返回前相当于guest与vmm间的共享内存。

pvclock_read_wallclock的访问
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
struct pvclock_vcpu_time_info *vcpu_time,
struct timespec *ts)
{

//等待vmm设置好wall_clock, 用version来标记数据是否更新
do {
version =wall_clock->version;
rmb(); /* fetch version before time */
now.tv_sec = wall_clock->sec;
now.tv_nsec =wall_clock->nsec;
rmb(); /* fetch time before checking version */
} while ((wall_clock->version & 1) || (version != wall_clock->version));
//这时wall_clock记录的是系统开机时的时间

//取得系统运行的时间, vcpu_time作为共享内存,其地址在kvm_register_clock通知了vmm
delta =pvclock_clocksource_read(vcpu_time);

//两者相加为wall_clock,
delta += now.tv_sec *(u64)NSEC_PER_SEC + now.tv_nsec;

now.tv_nsec =do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;

set_normalized_timespec(ts,now.tv_sec, now.tv_nsec);
}

static cycle_t kvm_clock_read(void) ==> pvclock_clocksource_read
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info*src)
{

do {
version =__pvclock_read_cycles(src, &ret, &flags); // __native_read_tsc
} while((src->version & 1) || version != src->version);

......
if ((valid_flags &PVCLOCK_TSC_STABLE_BIT) &&
(flags & PVCLOCK_TSC_STABLE_BIT))
return ret;
last =atomic64_read(&last_value);
do {
if (ret < last)
return last;
last =atomic64_cmpxchg(&last_value, last, ret);
} while (unlikely(last!= ret));

return ret;
}

(2) VMM kvmclock实现
msr的实现:
kvm_set_msr_common ==> case MSR_KVM_WALL_CLOCK ==> kvm_write_wall_clock
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
.......
//a. 读guest version
r = kvm_read_guest(kvm,wall_clock, &version, sizeof(version));
if (r)
return;

if (version & 1)
++version; /* first time write, random junk */

++version;

kvm_write_guest(kvm,wall_clock, &version, sizeof(version));//更新version

getboottime(&boot);//得到系统的boot时间

if(kvm->arch.kvmclock_offset) {
struct timespec ts =ns_to_timespec(kvm->arch.kvmclock_offset);
boot =timespec_sub(boot, ts);
}
wc.sec = boot.tv_sec;
wc.nsec = boot.tv_nsec;
wc.version = version;

kvm_write_guest(kvm,wall_clock, &wc, sizeof(wc)); //更新guest wall_clock

version++;
kvm_write_guest(kvm,wall_clock, &version, sizeof(version)); //更新version,完成通讯
}

kvm_read_guest/kvm_write_guest 的工作原理是通过gpa得到对应page 的hva和页内偏移,然后就能读写内存了
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data,unsigned long len)
{
gfn_t gfn = gpa>> PAGE_SHIFT;
int seg;
int offset =offset_in_page(gpa);
int ret;

while ((seg =next_segment(len, offset)) != 0) {
ret =kvm_read_guest_page(kvm, gfn, data, offset, seg);
if (ret < 0)
return ret;
offset = 0;
len -= seg;
data += seg;
++gfn;
}
return 0;
}
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data,int offset,
int len)
{
int r;
unsigned long addr;

addr =gfn_to_hva_prot(kvm, gfn, NULL);
if(kvm_is_error_hva(addr))
return -EFAULT;
r = kvm_read_hva(data,(void __user *)addr + offset, len); // call __copy_from_user
if (r)
return -EFAULT;
return 0;
}

kvm_set_msr_common ==> case MSR_KVM_SYSTEM_TIME
a. kvmclock_reset //vcpu->arch.pv_time_enabled = false
b. kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE,vcpu);
c. if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data &~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
vcpu->arch.pv_time_enabled= false;
else
vcpu->arch.pv_time_enabled= true;
kvm_gfn_to_hva_cache_init会得到guest os 的hv_clock[cpu].pvti

vcpu_enter_guest==> KVM_REQ_GLOBAL_CLOCK_UPDATE kvm_gen_kvmclock_update(vcpu);
set_bit(KVM_REQ_CLOCK_UPDATE,&v->requests);
schedule_delayed_work(&kvm->arch.kvmclock_update_work,
KVMCLOCK_UPDATE_DELAY);

由于在kvm_arch_init_vm时:
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work,kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work,kvmclock_sync_fn);

所以kvm->arch.kvmclock_update_work==》
static void kvmclock_update_fn(struct work_struct *work)
{
。。。。。。
//对每个vcpu设置KVM_REQ_CLOCK_UPDATE
kvm_for_each_vcpu(i,vcpu, kvm) {
set_bit(KVM_REQ_CLOCK_UPDATE,&vcpu->requests);
kvm_vcpu_kick(vcpu);
}
}

vcpu_enter_guest==>KVM_REQ_CLOCK_UPDATE kvm_guest_time_update(vcpu);
kvm_guest_time_update会将时间更新到vcpu->arch.pv_time(即guest注册的hv_clock[cpu].pvti共享内存)

4.3.3 Cpu Steal time
Cpu Steal time指的是vcpu 等待 real cpu 的时间, 因为vcpu会发生vm-exit而进入vmm;进入vmm 后到重新vm-entry的时间就是一次cpu steal time. 该指标是衡量vm性能的重要指标。 通过半虚拟化技术guest os能得到cpu steal time. VMM与guest通讯机制与上一节类似,本节就不讨论了。

(1) Guest os 实现
1. kvm_guest_init注册函数指针pv_time_ops.steal_clock = kvm_steal_clock; 对非guest而言,该函数为native_steal_clock,直接返回0

2. Guest os 通过kvm_register_steal_time 通知vmm 共享内存地址:
wrmsrl(MSR_KVM_STEAL_TIME,(slow_virt_to_phys(st) | KVM_MSR_ENABLED));

内核调度器(kernel/sched/core.c)中的调用路径: update_rq_clock ==> update_rq_clock_task ==>
paravirt_steal_clock(cpu_of(rq))==> pv_time_ops.steal_clock;

(2) vmm 实现
kvm_set_msr_common ==》 case MSR_KVM_STEAL_TIME
a. kvm_gfn_to_hva_cache_init得到guest os gpa -> hva
b. vcpu->arch.st.last_steal= current->sched_info.run_delay;
c. accumulate_steal_time(vcpu);
static void accumulate_steal_time(struct kvm_vcpu *vcpu)
{
.......
delta =current->sched_info.run_delay - vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal= current->sched_info.run_delay;
vcpu->arch.st.accum_steal= delta;
}
第一次调用时delta会为0,但以后每次vcpu_load时,kvm_arch_vcpu_load都会重新调用accumulate_steal_time

d. kvm_make_request(KVM_REQ_STEAL_UPDATE,vcpu);

vcpu_enter_guest ==> record_steal_time(vcpu);
static void record_steal_time(struct kvm_vcpu *vcpu)
{

............ //kvm_read_guest_cached
vcpu->arch.st.steal.steal+= vcpu->arch.st.accum_steal;
vcpu->arch.st.steal.version+= 2;
vcpu->arch.st.accum_steal= 0;
......... //kvm_write_guest_cached
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: