VCPU的运行过程

VCPU的运行过程

Posted by lwk on May 29, 2022

今天主要阐述VCPU的执行过程,在整个cpu虚拟化过程中,VCPU的运行是核心,包括了vm entry和vm exit两个过程。在这两个过程中,又牵扯到虚拟cpu和物理cpu的切换过程。由于本文是以intel CPU虚拟化为例,因此涉及到的代码都会以VMX为主。

先来看下VCPU运行的代码开始部分,这部分代码主要在回调函数kvm_vcpu_thread_fn中。


static void *kvm_vcpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu, &error_fatal);
    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu_thread_signal_created(cpu);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    kvm_destroy_vcpu(cpu);
    cpu_thread_signal_destroyed(cpu);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}

关于kvm_vcpu_thread_fn函数,在前篇文章都有阐述,主要是虚拟机初始化过程中(qemu 启动)创建VCPU时(主要是创建qemu线程)执行的回调函数。在该函数的内部有一个do-while循环,循环执行虚拟机的代码。从代码可以看到,循环中主要调用的是kvm_cpu_exec函数。

/*
 * Run guest code on a vCPU (excerpt; elided parts are marked with dots).
 *
 * @cpu: the vCPU to run.
 *
 * Issues KVM_RUN in a loop and dispatches on run->exit_reason after each
 * return; the loop continues for as long as the handler leaves ret at 0.
 * Returns EXCP_HLT early when kvm_arch_process_async_events() reports work.
 */
int kvm_cpu_exec(CPUState *cpu)
{   
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;
    
    DPRINTF("kvm_cpu_exec()\n");

    /* Async events pending: clear the exit request and report a halt. */
    if (kvm_arch_process_async_events(cpu)) {
        qatomic_set(&cpu->exit_request, 0);
        return EXCP_HLT;
    }
    
    /* The run loop below executes without the iothread lock. */
    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);
            
    do {
.......
        kvm_arch_pre_run(cpu, run);
        /* Enter the guest; returns once KVM hands control back to us. */
        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        attrs = kvm_arch_post_run(cpu, run);
        /* Dispatch on the exit reason KVM stored in the shared kvm_run. */
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* Called outside BQL */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            /* Called outside BQL */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:        
.......
        default:
            /* Anything not handled above goes to the arch-specific handler. */
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);
......
}

kvm_arch_pre_run函数首先做一些运行前的准备工作,主要是nmi和smi的中断注入,之后触发VCPU的系统调用ioctl(KVM_RUN)让cpu运行起来,KVM模块收到ioctl(KVM_RUN)指令之后会执行VMX指令,把该VCPU的运行的物理CPU从VMX ROOT模式切换到VMX non-root模式,开始运行虚拟机的代码。虚拟机内部如果遇到一些事件产生VM exit,就会退出到KVM,如果KVM无法处理就会分发到QEMU,也就是在ioctl(KVM_RUN)返回的时候调用kvm_arch_post_run来进行一些初步处理,然后开始根据QEMU与KVM共享内存kvm_run中的数据来判断退出原因,并作出相应处理。

这里给出QEMU/KVM/以及虚拟机之间的关系图,如下图所示: image

接下来分析ioctl(KVM_RUN)在内核中的运行过程,对应的处理函数是kvm_arch_vcpu_ioctl_run,该函数主要调用vcpu_run,代码如下:


/*
 * Main in-kernel run loop for one vCPU.
 *
 * @vcpu: the vCPU to run.
 *
 * Holds the kvm->srcu read lock while looping. Each iteration either enters
 * the guest or blocks, depending on kvm_vcpu_running(). The loop ends when
 * that step returns a value <= 0, when an interrupt-window exit is requested,
 * or when a signal is pending for the current task.
 *
 * Returns the last value of r: the result of the enter/block step, 0 for an
 * IRQ-window exit, or -EINTR for a pending signal.
 */
static int vcpu_run(struct kvm_vcpu *vcpu)
{
    int r;
    struct kvm *kvm = vcpu->kvm;

    vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
    vcpu->arch.l1tf_flush_l1d = true;

    for (;;) {
        if (kvm_vcpu_running(vcpu)) {
            r = vcpu_enter_guest(vcpu);
        } else {
            r = vcpu_block(kvm, vcpu);
        }

        /* r <= 0 stops the loop; r is handed back to the caller as-is. */
        if (r <= 0)
            break;

        kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
        if (kvm_cpu_has_pending_timer(vcpu))
            kvm_inject_pending_timer_irqs(vcpu);

        /* Report an open interrupt window with exit reason IRQ_WINDOW_OPEN. */
        if (dm_request_for_irq_injection(vcpu) &&
            kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
            r = 0;
            vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
            ++vcpu->stat.request_irq_exits;
            break;
        }

        kvm_check_async_pf_completion(vcpu);

        /* A pending signal ends the loop with -EINTR. */
        if (signal_pending(current)) {
            r = -EINTR;
            vcpu->run->exit_reason = KVM_EXIT_INTR;
            ++vcpu->stat.signal_exits;
            break;
        }
        /* Drop the SRCU read lock across the reschedule, then retake it. */
        if (need_resched()) {
            srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
            cond_resched();
            vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
        }
    }

    srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

    return r;
}

vcpu_run函数主要也是一个循环结构。首先调用kvm_vcpu_running判断当前CPU是否可以运行,代码如下:

/*
 * Tell whether the vCPU may enter the guest right now.
 *
 * While the vCPU is in guest mode, the optional check_nested_events callback
 * is invoked first. The vCPU counts as running only when its mp_state is
 * KVM_MP_STATE_RUNNABLE and it is not halted by apf.halted.
 */
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
    if (is_guest_mode(vcpu)) {
        if (kvm_x86_ops->check_nested_events)
            kvm_x86_ops->check_nested_events(vcpu, false);
    }

    if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
        return false;

    return !vcpu->arch.apf.halted;
}

判断主要是2个方面,1是vcpu.arch结构的mp_state是否是KVM_MP_STATE_RUNNABLE,2是vcpu.arch结构中的apf.halted表示的虚拟机中是否存在需要访问却被宿主机swap出去的内存页,如果由于apf而被暂停,则这个时候虚拟机CPU也是不能运行的。

如果判断结果是可运行的,则会调用vcpu_enter_guest来进入虚拟机,在vcpu_enter_guest的最开始,该函数会对vcpu->requests上的请求进行处理,这个requests为unsigned long类型,每一位代表一个请求。

vcpu_enter_guest接下来处理虚拟中断请求,调用kvm_mmu_reload重新加载虚拟机cpu对应的内存。在禁止抢占之后调用回调函数prepare_guest_switch,vmx对应的函数是vmx_save_host_state,主要是保存host的状态,通过vmcs_writel将宿主机的状态保存在VMCS区域中,使虚拟机退出之后宿主机能够正常运行。

紧接着调用vmx的run函数,对应函数是vmx_vcpu_run。该函数首先根据VCPU的状态写一些VMCS的值,接着执行汇编ASM_VMX_VMLAUNCH(汇编代码在文件 arch/x86/kvm/vmx/vmenter.S中,本文不贴具体代码了)将CPU置于guest模式,这个时候cpu就开始执行虚拟机的代码,当发生退出时候,其地址是vmx_return,这个值是在vmx_vcpu_setup调用的vmx_set_constant_host_state函数中设置的。


/*
 * VMX implementation of the kvm_x86_ops callback table (excerpt; elided
 * entries are marked with dots). Only the .run and .handle_exit callbacks
 * discussed in the text are shown.
 */
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
......
    .run = vmx_vcpu_run,
    .handle_exit = vmx_handle_exit,
......
};

vmx_vcpu_run接着调用vmcs_read32读出虚拟机退出的原因,保存在vcpu_vmx结构体的exit_reason成员中。vmx_vcpu_run调用3个函数对本次退出进行预处理,分别是vmx_complete_atomic_exit(vmx)、vmx_recover_nmi_blocking(vmx)和vmx_complete_interrupts(vmx)(下面贴出的这一版代码中只能看到后两个函数的调用)。

vmx_vcpu_run代码如下:

/*
 * Enter the guest on this vCPU and record why it exited (excerpt; elided
 * parts, including the actual VM entry, are marked with dots).
 *
 * @vcpu: the vCPU to run.
 *
 * After the exit, the exit reason is stored in vmx->exit_reason. On a failed
 * VM entry the function returns before marking the VMCS as launched.
 */
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{   
    struct vcpu_vmx *vmx = to_vmx(vcpu);
    unsigned long cr3, cr4; 
    
    /* Record the guest's net vcpu time for enforced NMI injections. */
    if (unlikely(!enable_vnmi &&
             vmx->loaded_vmcs->soft_vnmi_blocked))
        vmx->loaded_vmcs->entry_time = ktime_get();
......
    /* Rewrite HOST_CR4 in the VMCS only when it differs from the cached value. */
    cr4 = cr4_read_shadow();
    if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
        vmcs_writel(HOST_CR4, cr4);
        vmx->loaded_vmcs->host_state.cr4 = cr4;
    }
........
    /* When vmx->fail is set, 0xdead is stored instead of reading the VMCS. */
    vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
    if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
        kvm_machine_check();

    /* Failed entry: skip the post-exit bookkeeping below. */
    if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
        return;

    vmx->loaded_vmcs->launched = 1; 
    vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

    vmx_recover_nmi_blocking(vmx);
    vmx_complete_interrupts(vmx);
}

回到vcpu_enter_guest,当虚拟机退出之后会调用vmx实现的handle_exit_irqoff回调来处理外部中断,并调用handle_exit回调来处理各种退出事件。先看handle_exit_irqoff对应的vmx_handle_exit_irqoff,代码如下:

/*
 * Early exit handling for the two exit reasons that have an irqoff handler.
 *
 * External-interrupt exits go to handle_external_interrupt_irqoff() and
 * exception/NMI exits go to handle_exception_nmi_irqoff(). Every other exit
 * reason is left untouched here.
 */
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
    struct vcpu_vmx *vmx = to_vmx(vcpu);

    switch (vmx->exit_reason) {
    case EXIT_REASON_EXTERNAL_INTERRUPT:
        handle_external_interrupt_irqoff(vcpu);
        break;
    case EXIT_REASON_EXCEPTION_NMI:
        handle_exception_nmi_irqoff(vmx);
        break;
    default:
        break;
    }
}
/*
 * Invoke the host's handler for the external interrupt that caused the exit.
 *
 * @vcpu: the vCPU that just exited.
 *
 * Reads the interrupt info from the VMCS, warns once and returns if it is not
 * an external interrupt, then looks up the vector's gate in the host IDT and
 * calls its entry point. The inline asm pushes flags and CS (plus SS and the
 * saved stack pointer on 64-bit) before the call.
 * NOTE(review): this presumably mimics the stack frame the CPU builds on
 * interrupt delivery - confirm against the handler's expectations.
 */
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
    unsigned int vector; 
    unsigned long entry;
#ifdef CONFIG_X86_64
    unsigned long tmp;
#endif
    gate_desc *desc;
    u32 intr_info;

    intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
    if (WARN_ONCE(!is_external_intr(intr_info),
        "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
        return;
        
    /* Locate the handler entry point for this vector in the host IDT. */
    vector = intr_info & INTR_INFO_VECTOR_MASK;
    desc = (gate_desc *)host_idt_base + vector;
    entry = gate_offset(desc);

    kvm_before_interrupt(vcpu);

    /* On 64-bit the stack pointer is saved in tmp and aligned to 16 bytes. */
    asm volatile(
#ifdef CONFIG_X86_64
        "mov %%" _ASM_SP ", %[sp]\n\t"
        "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
        "push $%c[ss]\n\t"
        "push %[sp]\n\t"
#endif
        "pushf\n\t"
        __ASM_SIZE(push) " $%c[cs]\n\t"
        CALL_NOSPEC
        :
#ifdef CONFIG_X86_64
        [sp]"=&r"(tmp),
#endif
        ASM_CALL_CONSTRAINT
        :
        THUNK_TARGET(entry),
        [ss]"i"(__KERNEL_DS),
        [cs]"i"(__KERNEL_CS)
    );

    kvm_after_interrupt(vcpu);
}

接下来阐述下handle_exit,对应的vmx函数是vmx_handle_exit。vmx_handle_exit是退出事件的总分发处理函数,在对一些特殊处理进行判断之后根据原因调用kvm_vmx_exit_handlers中定义的分发函数。

/*
 * Dispatch table indexed by VMX exit reason (excerpt; elided entries are
 * marked with dots). Each slot holds the handler for that exit reason; every
 * handler takes the vCPU and returns an int.
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
    [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
    [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
    [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
    [EXIT_REASON_NMI_WINDOW]          = handle_nmi_window,
    [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
    [EXIT_REASON_CR_ACCESS]               = handle_cr,
    [EXIT_REASON_DR_ACCESS]               = handle_dr,
    [EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
    [EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
    [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
    [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
    [EXIT_REASON_HLT]                     = kvm_emulate_halt,
    [EXIT_REASON_INVD]            = handle_invd,
    [EXIT_REASON_INVLPG]              = handle_invlpg,
    [EXIT_REASON_RDPMC]                   = handle_rdpmc,
    [EXIT_REASON_VMCALL]                  = handle_vmcall,
    /* The VMX instructions below all share one handler. */
    [EXIT_REASON_VMCLEAR]             = handle_vmx_instruction,
    [EXIT_REASON_VMLAUNCH]            = handle_vmx_instruction,
    [EXIT_REASON_VMPTRLD]             = handle_vmx_instruction,
    [EXIT_REASON_VMPTRST]             = handle_vmx_instruction,
......
};

这里面的EXIT_REASON_XXX定义了退出的原因,对应的handle_xxx定义了相应的处理函数,有的退出事件KVM能够自己处理,这个时候就直接处理并返回,准备下一轮的VCPU运行,如果KVM无法处理,如无法处理对应设备的PIO和MMIO端口的读写,则需要将事件分发到QEMU进行处理。

例如EXIT_REASON_CPUID,对应的处理函数是kvm_emulate_cpuid,这里主要是处理虚拟机的cpuid指令。从代码可以看出,CPUID的处理是查询之前QEMU的设置,并直接返回数据。

/*
 * Emulate a guest CPUID instruction.
 *
 * @vcpu: the vCPU that executed CPUID.
 *
 * Returns 1 without emulating when CPUID faulting is enabled and the CPL-0
 * check fails. Otherwise the leaf and subleaf are taken from the guest's
 * RAX/RCX, kvm_cpuid() fills the four result values, and they are written
 * back to RAX/RBX/RCX/RDX. The return value is then that of
 * kvm_skip_emulated_instruction().
 */
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
    u32 val_a, val_b, val_c, val_d;

    if (cpuid_fault_enabled(vcpu)) {
        if (!kvm_require_cpl(vcpu, 0))
            return 1;
    }

    /* Inputs: leaf in RAX, subleaf in RCX. */
    val_a = kvm_rax_read(vcpu);
    val_c = kvm_rcx_read(vcpu);

    kvm_cpuid(vcpu, &val_a, &val_b, &val_c, &val_d, true);

    /* Outputs go back to the guest registers. */
    kvm_rax_write(vcpu, val_a);
    kvm_rbx_write(vcpu, val_b);
    kvm_rcx_write(vcpu, val_c);
    kvm_rdx_write(vcpu, val_d);

    return kvm_skip_emulated_instruction(vcpu);
}
/*
 * Look up the CPUID values the guest should see.
 *
 * @vcpu:        the vCPU whose CPUID entries are searched.
 * @eax, @ecx:   in: requested function and index; out: result values.
 * @ebx, @edx:   out: result values.
 * @check_limit: when true, a miss on a non-AMD guest whose function is out of
 *               range is retried with the function number held in eax of
 *               entry (0, 0).
 *
 * When no entry is found, all four outputs are zeroed. For functions 0xb and
 * 0x1f, ecx and edx are then filled from index 1 of that function if present.
 *
 * Returns true only when the initially requested (function, index) pair had
 * an entry; a hit obtained through the retry still returns false.
 */
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
           u32 *ecx, u32 *edx, bool check_limit)
{
    u32 leaf = *eax;
    u32 subleaf = *ecx;
    struct kvm_cpuid_entry2 *hit;
    struct kvm_cpuid_entry2 *basic;
    bool exact;

    hit = kvm_find_cpuid_entry(vcpu, leaf, subleaf);
    exact = (hit != NULL);

    if (!hit && check_limit && !guest_cpuid_is_amd(vcpu) &&
        !cpuid_function_in_range(vcpu, leaf)) {
        basic = kvm_find_cpuid_entry(vcpu, 0, 0);
        if (basic) {
            leaf = basic->eax;
            hit = kvm_find_cpuid_entry(vcpu, leaf, subleaf);
        }
    }

    if (!hit) {
        *eax = *ebx = *ecx = *edx = 0;
        if (leaf == 0xb || leaf == 0x1f) {
            hit = kvm_find_cpuid_entry(vcpu, leaf, 1);
            if (hit) {
                *ecx = subleaf & 0xff;
                *edx = hit->edx;
            }
        }
    } else {
        *eax = hit->eax;
        *ebx = hit->ebx;
        *ecx = hit->ecx;
        *edx = hit->edx;
        if (leaf == 7 && subleaf == 0) {
            u64 data;

            /* Hide RTM/HLE when TSX_CTRL has the CPUID_CLEAR bit set. */
            if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
                (data & TSX_CTRL_CPUID_CLEAR))
                *ebx &= ~(F(RTM) | F(HLE));
        }
    }

    trace_kvm_cpuid(leaf, *eax, *ebx, *ecx, *edx, exact);
    return exact;
}

上面的例子是KVM直接处理并返回的,而有些退出事件KVM自己无法处理,这个时候需要交给QEMU进行处理,例如IO指令。EXIT_REASON_IO_INSTRUCTION对应的处理函数是handle_io。

/*
 * Exit handler for guest I/O instructions.
 *
 * @vcpu: the vCPU that executed the I/O instruction.
 *
 * Decodes the exit qualification: bit 4 marks a string instruction, bit 3 the
 * "in" direction, bits 0-2 hold size - 1 and the port starts at bit 16.
 * String I/O goes to the instruction emulator; everything else goes to
 * kvm_fast_pio(). Returns whatever the chosen path returns.
 */
static int handle_io(struct kvm_vcpu *vcpu)
{
    unsigned long qual = vmcs_readl(EXIT_QUALIFICATION);
    unsigned port;
    int size, in;

    ++vcpu->stat.io_exits;

    /* String I/O is left to the instruction emulator. */
    if ((qual & 16) != 0)
        return kvm_emulate_instruction(vcpu, 0);

    in = (qual & 8) != 0;
    size = (qual & 7) + 1;
    port = qual >> 16;

    return kvm_fast_pio(vcpu, size, port, in);
}
/*
 * Handle a single non-string port I/O access.
 *
 * @vcpu: the vCPU performing the access.
 * @size: access size passed through to the direction helper.
 * @port: I/O port number.
 * @in:   non-zero for a port read, zero for a port write.
 *
 * The instruction is skipped only when the direction helper returns non-zero.
 * Returns 1 when both the helper and kvm_skip_emulated_instruction() return
 * non-zero, otherwise 0.
 */
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
{
    int handled = in ? kvm_fast_pio_in(vcpu, size, port)
                     : kvm_fast_pio_out(vcpu, size, port);

    if (!handled)
        return 0;

    return kvm_skip_emulated_instruction(vcpu) ? 1 : 0;
}
/*
 * Emulate a single port write using the value in the guest's RAX.
 *
 * @vcpu: the vCPU performing the write.
 * @size: access size passed to emulator_pio_out_emulated().
 * @port: I/O port number.
 *
 * Returns the non-zero result of emulator_pio_out_emulated() directly.
 * Otherwise it installs a complete_userspace_io callback and returns 0.
 */
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
                unsigned short port)
{
    unsigned long rax = kvm_rax_read(vcpu);
    bool quirk_7e;
    int done;

    done = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
                                     size, port, &rax, 1);
    if (done)
        return done;

    quirk_7e = port == 0x7e &&
               kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP);

    if (!quirk_7e) {
        /* Usual path: remember the linear RIP for the completion callback. */
        vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
        vcpu->arch.complete_userspace_io = complete_fast_pio_out;
        return 0;
    }

    /*
     * Workaround userspace that relies on old KVM behavior of %rip being
     * incremented prior to exiting to userspace to handle "OUT 0x7e".
     */
    vcpu->arch.complete_userspace_io = complete_fast_pio_out_port_0x7e;
    kvm_skip_emulated_instruction(vcpu);
    return 0;
}
/*
 * Emulate a port write on behalf of the instruction emulator.
 *
 * @ctxt:  emulation context; the owning vCPU is derived from it.
 * @size:  size of one element.
 * @port:  I/O port number.
 * @val:   data to write; size * count bytes are copied from it.
 * @count: number of elements.
 *
 * Copies the data into the vCPU's pio_data buffer, emits the PIO tracepoint,
 * and returns the result of emulator_pio_in_out().
 */
static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
                     int size, unsigned short port,
                     const void *val, unsigned int count)
{
    struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
    unsigned int nbytes = size * count;

    memcpy(vcpu->arch.pio_data, val, nbytes);
    trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);

    return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
}

/*
 * Perform a port access in the kernel or set up an exit to userspace.
 *
 * @vcpu:  the vCPU performing the access.
 * @size:  size of one element.
 * @port:  I/O port number.
 * @val:   not used by this function.
 * @count: number of elements.
 * @in:    true for a port read, false for a port write.
 *
 * Records the access in vcpu->arch.pio and calls kernel_pio(). When that
 * returns 0, the pending count is cleared and 1 is returned. Otherwise the
 * shared run structure is filled in as a KVM_EXIT_IO exit and 0 is returned.
 */
static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
                   unsigned short port, void *val,
                   unsigned int count, bool in)
{
    vcpu->arch.pio.port = port;
    vcpu->arch.pio.in = in;
    vcpu->arch.pio.count = count;
    vcpu->arch.pio.size = size;

    if (kernel_pio(vcpu, vcpu->arch.pio_data)) {
        /* kernel_pio() returned non-zero: describe the access for userspace. */
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
        vcpu->run->io.size = size;
        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
        vcpu->run->io.count = count;
        vcpu->run->io.port = port;
        return 0;
    }

    vcpu->arch.pio.count = 0;
    return 1;
}

当vcpu_enter_guest的返回值小于等于0时,会导致vcpu_run退出for循环,进而使该ioctl返回到用户态。

以上过程是VCPU执行一轮虚拟机代码的过程,即VM_EXIT和VM_ENTRY的过程。