今天主要阐述VCPU的执行过程,在整个cpu虚拟化过程中,VCPU的运行是核心,包括了vm entry和vm exit两个过程。在这两个过程中,又牵扯到虚拟cpu和物理cpu的切换过程。由于本文是以intel CPU虚拟化为例,因此涉及到的代码都会以VMX为主。
先来看下VCPU运行的代码开始部分,这部分代码主要在回调函数kvm_vcpu_thread_fn中。
static void *kvm_vcpu_thread_fn(void *arg)
{
CPUState *cpu = arg;
int r;
rcu_register_thread();
qemu_mutex_lock_iothread();
qemu_thread_get_self(cpu->thread);
cpu->thread_id = qemu_get_thread_id();
cpu->can_do_io = 1;
current_cpu = cpu;
r = kvm_init_vcpu(cpu, &error_fatal);
kvm_init_cpu_signals(cpu);
/* signal CPU creation */
cpu_thread_signal_created(cpu);
qemu_guest_random_seed_thread_part2(cpu->random_seed);
do {
if (cpu_can_run(cpu)) {
r = kvm_cpu_exec(cpu);
cpu_handle_guest_debug(cpu);
}
}
qemu_wait_io_event(cpu);
} while (!cpu->unplug || cpu_can_run(cpu));
kvm_destroy_vcpu(cpu);
cpu_thread_signal_destroyed(cpu);
qemu_mutex_unlock_iothread();
rcu_unregister_thread();
return NULL;
}
关于kvm_vcpu_thread_fn函数,在前篇文章都有阐述,主要是虚拟机初始化过程中(qemu 启动)创建VCPU时(主要是创建qemu线程)执行的回调函数。在该函数的内部有for循环,执行虚拟机的代码。从代码可以看到主要是kvm_cpu_exec函数。
int kvm_cpu_exec(CPUState *cpu)
{
struct kvm_run *run = cpu->kvm_run;
int ret, run_ret;
DPRINTF("kvm_cpu_exec()\n");
if (kvm_arch_process_async_events(cpu)) {
qatomic_set(&cpu->exit_request, 0);
return EXCP_HLT;
}
qemu_mutex_unlock_iothread();
cpu_exec_start(cpu);
do {
.......
kvm_arch_pre_run(cpu, run);
run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
attrs = kvm_arch_post_run(cpu, run);
switch (run->exit_reason) {
case KVM_EXIT_IO:
DPRINTF("handle_io\n");
/* Called outside BQL */
kvm_handle_io(run->io.port, attrs,
(uint8_t *)run + run->io.data_offset,
run->io.direction,
run->io.size,
run->io.count);
ret = 0;
break;
case KVM_EXIT_MMIO:
DPRINTF("handle_mmio\n");
/* Called outside BQL */
address_space_rw(&address_space_memory,
run->mmio.phys_addr, attrs,
run->mmio.data,
run->mmio.len,
run->mmio.is_write);
ret = 0;
break;
case KVM_EXIT_IRQ_WINDOW_OPEN:
.......
default:
DPRINTF("kvm_arch_handle_exit\n");
ret = kvm_arch_handle_exit(cpu, run);
break;
}
} while (ret == 0);
......
}
kvm_arch_pre_run函数首先做一些运行前的准备工作,主要是nmi和smi的中断注入,之后触发VCPU的系统调用ioctl(KVM_RUN)让cpu运行起来,KVM模块收到ioctl(KVM_RUN)指令之后会执行VMX指令,把该VCPU的运行的物理CPU从VMX ROOT模式切换到VMX non-root模式,开始运行虚拟机的代码。虚拟机内部如果遇到一些事件产生VM exit,就会退出到KVM,如果KVM无法处理就会分发到QEMU,也就是在ioctl(KVM_RUN)返回的时候调用kvm_arch_post_run来进行一些初步处理,然后开始根据QEMU与KVM共享内存kvm_run中的数据来判断退出原因,并作出相应处理。
这里给出QEMU/KVM/以及虚拟机之间的关系图,如下图所示:
接下来分析ioctl(KVM_RUN)在内核中的运行过程,对应的处理函数是kvm_arch_vcpu_ioctl_run,该函数主要调用vcpu_run,代码如下:
static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
struct kvm *kvm = vcpu->kvm;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
r = vcpu_block(kvm, vcpu);
}
if (r <= 0)
break;
kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
if (dm_request_for_irq_injection(vcpu) &&
kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
r = 0;
vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
break;
}
kvm_check_async_pf_completion(vcpu);
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
break;
}
if (need_resched()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
cond_resched();
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
return r;
}
vcpu_run函数主要也是一个循环结构。首先调用kvm_vcpu_running判断当前CPU是否可以运行,代码如下:
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
kvm_x86_ops->check_nested_events(vcpu, false);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
}
判断主要是2个方面,1是vcpu.arch结构的mp_state是否是KVM_MP_STATE_RUNNABLE,2是vcpu.arch结构中的apf.halted表示的虚拟机中是否存在需要访问却被宿主机swap出去的内存页,如果由于apf而被暂停,则这个时候虚拟机CPU也是不能运行的。
如果判断结果是可运行的,则会调用vcpu_enter_guest来进入虚拟机,在vcpu_enter_guest的最开始,该函数会对vcpu->requests上的请求进行处理,这个requests为unsigned longleixing ,每一位代表一个请求。
vcpu_enter_guest接下来处理虚拟中断请求,调用kvm_mmu_reload重启加载虚拟机cpu对应的内存。在禁止抢占之后调用回调函数prepare_guest_switch,vmx对应的函数是vmx_save_host_state,主要是保存host的状态,通过vmcs_writel将宿主机的状态保存在VMCS区域中,使虚拟机退出之后能够正常运行。
紧接着调用vmx的run函数,对应函数是vmx_vcpu_run。该函数首先根据VCPU的状态写一些VMCS的值,接着执行汇编ASM_VMX_VMLAUNCH(汇编代码在文件 arch/x86/kvm/vmx/vmenter.S中,本文不贴具体代码了)将CPU置于guest模式,这个时候cpu就开始执行虚拟机的代码,当发生退出时候,其地址是vmx_return,这个值是在vmx_vcpu_setup调用的vmx_set_constant_host_state函数中设置的。
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
......
.run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
......
}
vmx_vcpu_run接着调用vmcs_read32读出虚拟机退出的原因,保存在vcpu_vmx结构体的exit_reason成员中。vmx_vcpu_run调用3个函数对本次退出进行预处理,分别是vmx_complete_atomic_exit(vmx)、vmx_recover_nmi_blocking(vmx)和vmx_complete_interrrupts(vmx)。
vmx_vcpu_run代码如下:
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked))
vmx->loaded_vmcs->entry_time = ktime_get();
......
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
........
vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
kvm_machine_check();
if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
return;
vmx->loaded_vmcs->launched = 1;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
回到vcpu_enter_guest,当虚拟机退出之后会调用vmx实现的handle_exit_irqoff回调来处理外部中断,并调用handle_exit回调来处理各种退出事件。先看handle_exit_irqoff对应的vmx_handle_exit_irqoff,代码如下:
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
handle_exception_nmi_irqoff(vmx);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
unsigned int vector;
unsigned long entry;
#ifdef CONFIG_X86_64
unsigned long tmp;
#endif
gate_desc *desc;
u32 intr_info;
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
if (WARN_ONCE(!is_external_intr(intr_info),
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
vector = intr_info & INTR_INFO_VECTOR_MASK;
desc = (gate_desc *)host_idt_base + vector;
entry = gate_offset(desc);
kvm_before_interrupt(vcpu);
asm volatile(
#ifdef CONFIG_X86_64
"mov %%" _ASM_SP ", %[sp]\n\t"
"and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
"push $%c[ss]\n\t"
"push %[sp]\n\t"
#endif
"pushf\n\t"
__ASM_SIZE(push) " $%c[cs]\n\t"
CALL_NOSPEC
:
#ifdef CONFIG_X86_64
[sp]"=&r"(tmp),
#endif
ASM_CALL_CONSTRAINT
:
THUNK_TARGET(entry),
[ss]"i"(__KERNEL_DS),
[cs]"i"(__KERNEL_CS)
);
kvm_after_interrupt(vcpu);
}
接下来阐述下handle_exit,对应的vmx函数是vmx_handle_exit。vmx_handle_exit是退出事件的总分发处理函数,在对一些特殊处理进行判断之后根据原因调用kvm_vmx_exit_handlers中定义的分发函数。
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
[EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
[EXIT_REASON_CPUID] = kvm_emulate_cpuid,
[EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = kvm_emulate_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_RDPMC] = handle_rdpmc,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
[EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
[EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
[EXIT_REASON_VMPTRST] = handle_vmx_instruction,
......
};
这里面的EXIT_REASON_XXX定义了退出的原因,对应的handle_xxx定义了相应的处理函数,有的退出事件KVM能够自己处理,这个时候就直接处理并返回,准备下一轮的VCPU运行,如果KVM无法处理,如无法处理对应设备的PIO和MMIO端口的读写,则需要将时间分发到QEMU进行处理。
例如EXIT_REASON_CPUID,对应的处理函数是kvm_emulate_cpuid,这里主要是处理虚拟机的cpuid指令。从代码可以看出,CPUID的处理是查询之前QEMU的设置,并直接返回数据。
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 eax, ebx, ecx, edx;
if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
return 1;
eax = kvm_rax_read(vcpu);
ecx = kvm_rcx_read(vcpu);
kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
kvm_rax_write(vcpu, eax);
kvm_rbx_write(vcpu, ebx);
kvm_rcx_write(vcpu, ecx);
kvm_rdx_write(vcpu, edx);
return kvm_skip_emulated_instruction(vcpu);
}
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool check_limit)
{
u32 function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *entry;
struct kvm_cpuid_entry2 *max;
bool found;
entry = kvm_find_cpuid_entry(vcpu, function, index);
found = entry;
if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) &&
!cpuid_function_in_range(vcpu, function)) {
max = kvm_find_cpuid_entry(vcpu, 0, 0);
if (max) {
function = max->eax;
entry = kvm_find_cpuid_entry(vcpu, function, index);
}
}
if (entry) {
*eax = entry->eax;
*ebx = entry->ebx;
*ecx = entry->ecx;
*edx = entry->edx;
if (function == 7 && index == 0) {
u64 data;
if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
(data & TSX_CTRL_CPUID_CLEAR))
*ebx &= ~(F(RTM) | F(HLE));
}
} else {
*eax = *ebx = *ecx = *edx = 0;
if (function == 0xb || function == 0x1f) {
entry = kvm_find_cpuid_entry(vcpu, function, 1);
if (entry) {
*ecx = index & 0xff;
*edx = entry->edx;
}
}
}
trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found);
return found;
}
上面的例子是KVM直接处理并返回的,而有些是KVM无法处理返回的,这个时候需要交个QEMU进行处理,例如IO指令。EXIT_REASON_IO_INSTRUCTION对应的处理函数是handle_io。
static int handle_io(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
int size, in, string;
unsigned port;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
string = (exit_qualification & 16) != 0;
++vcpu->stat.io_exits;
if (string)
return kvm_emulate_instruction(vcpu, 0);
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
in = (exit_qualification & 8) != 0;
return kvm_fast_pio(vcpu, size, port, in);
}
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
{
int ret;
if (in)
ret = kvm_fast_pio_in(vcpu, size, port);
else
ret = kvm_fast_pio_out(vcpu, size, port);
return ret && kvm_skip_emulated_instruction(vcpu);
}
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port)
{
unsigned long val = kvm_rax_read(vcpu);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
size, port, &val, 1);
if (ret)
return ret;
/*
* Workaround userspace that relies on old KVM behavior of %rip being
* incremented prior to exiting to userspace to handle "OUT 0x7e".
*/
if (port == 0x7e &&
kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
vcpu->arch.complete_userspace_io =
complete_fast_pio_out_port_0x7e;
kvm_skip_emulated_instruction(vcpu);
} else {
vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
return 0;
}
static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
int size, unsigned short port,
const void *val, unsigned int count)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
memcpy(vcpu->arch.pio_data, val, size * count);
trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
}
static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val,
unsigned int count, bool in)
{
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = in;
vcpu->arch.pio.count = count;
vcpu->arch.pio.size = size;
if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
vcpu->arch.pio.count = 0;
return 1;
}
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = size;
vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
vcpu->run->io.count = count;
vcpu->run->io.port = port;
return 0;
}
vcpu_enter_guest的返回值小于等于0,会导致该函数退出for循环,进行使该ioclt返回到用户态。
以上过程是VCPU执行一轮虚拟机代码的过程,即VM_EXIT和VM_ENTRY的过程。