KVM中VCPU的创建流程

KVM中VCPU的创建流程

Posted by lwk on May 24, 2022

上篇文章提到kvm_create_vm函数会创建一台虚拟机,用户态可以通过ioctl系统调用访问该虚拟机。同样,对于cpu,kvm也提供了类似的方式来实现创建和访问,对应的函数是kvm_vm_ioctl。


/*
 * File operations attached to a VM file descriptor.  An ioctl() issued on
 * the fd is handled by kvm_vm_ioctl, and kvm_vm_release is the release
 * hook.  KVM_COMPAT() supplies the compat-ioctl entry.
 */
static struct file_operations kvm_vm_fops = {
    .unlocked_ioctl = kvm_vm_ioctl,
    .release = kvm_vm_release,
    .llseek = noop_llseek,
    KVM_COMPAT(kvm_vm_compat_ioctl),
};

用户态进程qemu通过系统调用ioctl访问kvm_vm_ioctl函数

/*
 * Obtain a vcpu file descriptor for @vcpu_id.
 *
 * The list s->kvm_parked_vcpus is searched first: when an entry with a
 * matching vcpu_id exists, it is unlinked and freed and its stored kvm_fd
 * is returned.  Otherwise a KVM_CREATE_VCPU ioctl is issued on the VM and
 * its result is returned.
 */
static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *parked;

    QLIST_FOREACH(parked, &s->kvm_parked_vcpus, node) {
        int fd;

        if (parked->vcpu_id != vcpu_id) {
            continue;
        }

        /* Save the fd before the entry itself is released. */
        fd = parked->kvm_fd;
        QLIST_REMOVE(parked, node);
        g_free(parked);
        return fd;
    }

    /* No matching entry in the list: ask KVM to create the vcpu. */
    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}
/*
 * Issue an ioctl on the VM file descriptor s->vmfd.
 *
 * Exactly one variadic argument is consumed, read as a void pointer, and
 * forwarded unchanged as the ioctl argument.
 *
 * Returns the ioctl() result, except that a -1 result is converted into
 * -errno.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    trace_kvm_vm_ioctl(type, param);
    rc = ioctl(s->vmfd, type, param);

    return (rc == -1) ? -errno : rc;
}

其中 kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 通过传入KVM_CREATE_VCPU标识表示这是创建VCPU的动作,参数vcpu_id是用户态下的apic_id,即VCPU索引号。这次ioctl调用最终会进入内核态的kvm_vm_ioctl函数(与上面QEMU中的封装函数同名),代码如下


/*
 * Kernel-side ioctl handler for a VM file descriptor (the function that
 * kvm_vm_fops installs as .unlocked_ioctl).
 *
 * filp->private_data holds the struct kvm of the VM; `ioctl` is the request
 * code and `arg` is the raw argument supplied by the caller.
 *
 * NOTE: this is an excerpt; only the KVM_CREATE_VCPU case is shown and the
 * rest of the function is elided.
 */
static long kvm_vm_ioctl(struct file *filp,
               unsigned int ioctl, unsigned long arg)
{   
    struct kvm *kvm = filp->private_data;
    void __user *argp = (void __user *)arg;
    int r;
        
    /* Reject callers whose mm is not the one recorded in kvm->mm. */
    if (kvm->mm != current->mm)
        return -EIO;
    switch (ioctl) {
    case KVM_CREATE_VCPU:
        /* For this request, arg is passed straight through as the vcpu id. */
        r = kvm_vm_ioctl_create_vcpu(kvm, arg);
        break;
........
}

创建vcpu函数kvm_vm_ioctl_create_vcpu,函数如下

/*
 * Create a vcpu with identifier @id in @kvm and expose it through a new
 * file descriptor.
 *
 * Returns the value produced by create_vcpu_fd() on success, or a negative
 * error code:
 *   -EINVAL  if @id >= KVM_MAX_VCPU_ID, or created_vcpus has already
 *            reached KVM_MAX_VCPUS;
 *   -EEXIST  if kvm_get_vcpu_by_id() already finds a vcpu with @id;
 *   otherwise whatever kvm_arch_vcpu_create(), kvm_arch_vcpu_setup() or
 *   create_vcpu_fd() reported.
 *
 * On every failure after the counter was bumped, the labels at the bottom
 * undo the earlier steps in reverse order.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{   
    int r;
    struct kvm_vcpu *vcpu;
    
    if (id >= KVM_MAX_VCPU_ID)
        return -EINVAL;
        
    /*
     * Account for the new vcpu in created_vcpus under kvm->lock, then drop
     * the lock: the arch-specific creation below runs unlocked.
     */
    mutex_lock(&kvm->lock);
    if (kvm->created_vcpus == KVM_MAX_VCPUS) {
        mutex_unlock(&kvm->lock);
        return -EINVAL;
    }
    
    kvm->created_vcpus++;
    mutex_unlock(&kvm->lock);

    /* Arch-specific allocation; failure is reported as an ERR_PTR value. */
    vcpu = kvm_arch_vcpu_create(kvm, id);
    if (IS_ERR(vcpu)) {
        r = PTR_ERR(vcpu);
        goto vcpu_decrement;
    }

    preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

    r = kvm_arch_vcpu_setup(vcpu);
    if (r)
        goto vcpu_destroy;

    kvm_create_vcpu_debugfs(vcpu);

    /*
     * Re-take the lock for publication.  The duplicate-id check happens
     * only here, after the vcpu has been fully built.
     */
    mutex_lock(&kvm->lock);
    if (kvm_get_vcpu_by_id(kvm, id)) {
        r = -EEXIST;
        goto unlock_vcpu_destroy;
    }

    /* The new vcpu takes the next free slot in kvm->vcpus[]. */
    vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
    BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);

    /* Now it's all set up, let userspace reach it */
    kvm_get_kvm(kvm);
    r = create_vcpu_fd(vcpu);
    if (r < 0) {
        /* Undo the kvm_get_kvm() above before unwinding. */
        kvm_put_kvm_no_destroy(kvm);
        goto unlock_vcpu_destroy;
    }

    kvm->vcpus[vcpu->vcpu_idx] = vcpu;
    /*
     * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
     * before kvm->online_vcpu's incremented value.
     */
    smp_wmb();
    atomic_inc(&kvm->online_vcpus);

    mutex_unlock(&kvm->lock);
    kvm_arch_vcpu_postcreate(vcpu);
    return r;

/* Error unwinding: each label falls through into the ones below it. */
unlock_vcpu_destroy:
    mutex_unlock(&kvm->lock);
    debugfs_remove_recursive(vcpu->debugfs_dentry);
vcpu_destroy:
    kvm_arch_vcpu_destroy(vcpu);
vcpu_decrement:
    /* Give back the slot accounted for at the top of the function. */
    mutex_lock(&kvm->lock);
    kvm->created_vcpus--;
    mutex_unlock(&kvm->lock);
    return r;
}

kvm_vm_ioctl_create_vcpu首先会判断vcpu索引(即要创建的vcpu编号)是否有效,也就是有没有达到最大VCPU编号KVM_MAX_VCPU_ID;接着在持有kvm->lock的情况下检查已创建的vcpu数量created_vcpus是否已经达到上限KVM_MAX_VCPUS(从5.x内核的代码了解到最大vcpu数为288)。任何一项检查不通过都会直接返回-EINVAL。然后调用kvm_arch_vcpu_create,由于这里是intel平台,所以最终调用的是vmx的回调函数。

/*
 * x86 entry point for vcpu creation.
 *
 * Prints a one-time warning when kvm_check_tsc_unstable() is true and the
 * VM already has at least one online vcpu, then hands the actual creation
 * to the vcpu_create callback in kvm_x86_ops and returns its result
 * unchanged.
 */
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                        unsigned int id)
{
    if (kvm_check_tsc_unstable()) {
        if (atomic_read(&kvm->online_vcpus) != 0) {
            printk_once(KERN_WARNING
            "kvm: SMP vm created on host with unstable TSC; "
            "guest TSC will not be reliable\n");
        }
    }

    return kvm_x86_ops->vcpu_create(kvm, id);
}

vcpu_create函数是个回调函数,这里以intel为例,因此调用的函数是vmx_create_vcpu,所在文件是arch/x86/kvm/vmx/vmx.c

/*
 * VMX implementation of the vcpu_create callback: allocates a struct
 * vcpu_vmx (which embeds the generic struct kvm_vcpu at offset 0) and
 * starts initialising it.
 *
 * Allocation failures are reported as ERR_PTR(-ENOMEM) or by jumping to a
 * cleanup label with err set.
 *
 * NOTE: this is an excerpt.  The function is cut off after
 * kvm_vcpu_init(); the cleanup labels (free_partial_vcpu, free_user_fpu,
 * free_vcpu), the uses of msr_bitmap/i/cpu and the final return are not
 * shown here.
 */
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
    int err; 
    struct vcpu_vmx *vmx;
    unsigned long *msr_bitmap;
    int i, cpu; 

    /* Compile-time check that the generic vcpu is the first member. */
    BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
        "struct kvm_vcpu must be at offset 0 for arch usercopy region");

    /* Zero-initialised vcpu_vmx from the kvm_vcpu_cache slab cache. */
    vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
    if (!vmx)
        return ERR_PTR(-ENOMEM);

    /* Two separate FPU state areas are allocated: user_fpu and guest_fpu. */
    vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
            GFP_KERNEL_ACCOUNT);
    if (!vmx->vcpu.arch.user_fpu) {
        printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
        err = -ENOMEM;
        goto free_partial_vcpu;
    }
    vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
            GFP_KERNEL_ACCOUNT);
    if (!vmx->vcpu.arch.guest_fpu) {
        printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
        err = -ENOMEM;
        goto free_user_fpu;
    }

    vmx->vpid = allocate_vpid();

    /* Initialise the generic part of the vcpu embedded in vmx. */
    err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
    if (err)
        goto free_vcpu;

    /* Default error code for the steps that follow (elided here). */
    err = -ENOMEM;

}

vmx的VCPU用结构vcpu_vmx表示,vcpu_vmx中第一个成员是类型为kvm_vcpu的vcpu变量,表示的是通用层面的VCPU。vmx_create_vcpu函数从kvm_vcpu_cache缓存中分配一个vcpu_vmx结构表示VCPU,这个缓存是在kvm_init中初始化的。allocate_vpid为该VCPU分配一个vpid,vpid用于开启EPT的情况,每个VCPU与一个VPID关联,这样在进行VCPU的切换时就可以不必将tlb中的数据全部冲洗掉了,从而提高性能。

vmx_create_vcpu函数紧接着调用kvm_vcpu_init函数对VCPU的通用部分进行初始化,此时的参数是vcpu_vmx的kvm_vcpu成员。kvm_vcpu_init对kvm_vcpu进行初始化,在其中会调用kvm_arch_vcpu_init对kvm_vcpu架构相关成员kvm_vcpu_arch进行初始化,kvm_arch_vcpu_init还会进行创建local APIC中断控制器、创建MMU等跟架构密切相关的工作。

如果使能了enable_pml,则分配pml使用的页。pml是page modification logging的缩写,用于在硬件层面记录虚拟机中被修改(写)过的物理页面,从而能够快速实现脏页标记。

vmx_create_vcpu接着分配保存虚拟机VCPU的msr寄存器的空间,并设置VCPU的loaded_vmcs成员指向另一个成员vmcs01。loaded_vmcs指向的是当前VCPU对应的VMCS区域,这里的vmcs01表示的是普通虚拟化,也就是没有嵌套的虚拟化;如果是嵌套虚拟化,该指针会指向其他值。然后调用alloc_vmcs为该VCPU分配VMCS区域,其大小是由全局变量vmcs_config.order决定的,再调用loaded_vmcs_init初始化刚刚分配的VMCS区域。

vmx_vcpu_load函数将per CPU变量current_vmcs设置成刚刚分配的VMCS,然后执行vmptrld指令。init_vmcs函数用来初始化当前物理CPU对应的VMCS结构。

以上就是vmx_create_vcpu的主要核心工作内容。可以看到,vmx_create_vcpu分配了一个代表VCPU的vcpu_vmx结构,然后分配对应的VMCS结构,将其与当前物理CPU绑定,并对该VMCS进行初始化。除了vmx_create_vcpu之外,还有一个函数比较重要,那就是kvm_arch_vcpu_setup,函数代码如下


/*
 * Arch-specific setup of a freshly created vcpu; invoked from
 * kvm_vm_ioctl_create_vcpu() right after kvm_arch_vcpu_create().
 *
 * Always returns 0.
 */
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
    vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
    vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
    kvm_vcpu_mtrr_init(vcpu);
    /*
     * The reset and the MMU initialisation run between vcpu_load() and
     * vcpu_put(), i.e. while the vcpu is loaded on the current CPU.
     */
    vcpu_load(vcpu);
    kvm_vcpu_reset(vcpu, false);
    kvm_init_mmu(vcpu, false);
    vcpu_put(vcpu);
    return 0;
}

/*
 * Load @vcpu onto the CPU this code is currently running on.
 *
 * Between get_cpu() and put_cpu() the vcpu's preempt notifier is
 * registered and the arch-specific load hook is called with the CPU
 * number that get_cpu() returned.
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
    int this_cpu;

    this_cpu = get_cpu();
    preempt_notifier_register(&vcpu->preempt_notifier);
    kvm_arch_vcpu_load(vcpu, this_cpu);
    put_cpu();
}

/*
 * x86 part of loading @vcpu onto physical CPU @cpu; called from
 * vcpu_load().
 *
 * NOTE: this is an excerpt; everything after the vcpu_load callback is
 * elided.
 */
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{       
    /* Address WBINVD may be executed by guest */
    if (need_emulate_wbinvd(vcpu)) {
        /*
         * When has_wbinvd_exit() reports true, just record @cpu in the
         * vcpu's wbinvd_dirty_mask.  Otherwise, if the vcpu has a recorded
         * CPU (vcpu->cpu != -1) that differs from @cpu, run wbinvd_ipi on
         * that recorded CPU.
         */
        if (kvm_x86_ops->has_wbinvd_exit())
            cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
        else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
            smp_call_function_single(vcpu->cpu,
                    wbinvd_ipi, NULL, 1);
    }

    /* Vendor-specific load hook from kvm_x86_ops. */
    kvm_x86_ops->vcpu_load(vcpu, cpu);
    
.........
}

vcpu_load函数会在禁止抢占的情况下注册VCPU的preempt_notifier,然后调用kvm_arch_vcpu_load。kvm_arch_vcpu_load的主要作用就是调用vmx实现的vcpu_load回调,即vmx_vcpu_load,完成当前物理CPU与该VCPU的VMCS的绑定。

kvm_vcpu_reset函数对kvm_vcpu的成员进行初始化,不仅包括通用的VCPU结构,还包括架构相关的vcpu.arch成员。最后调用vmx的vcpu_reset回调vmx_vcpu_reset函数设置VMCS的相关域。

/*
 * Reset the state of @vcpu.  @init_event is forwarded unchanged to
 * kvm_lapic_reset() and to the vendor vcpu_reset callback.
 *
 * NOTE: this is an excerpt; part of the body is elided.
 */
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{   
    kvm_lapic_reset(vcpu, init_event);
    
    vcpu->arch.hflags = 0;

    /* Clear pending/queued SMI, NMI, interrupt and exception state. */
    vcpu->arch.smi_pending = 0;
    vcpu->arch.smi_count = 0;
    atomic_set(&vcpu->arch.nmi_queued, 0);
    vcpu->arch.nmi_pending = 0;
    vcpu->arch.nmi_injected = false;
    kvm_clear_interrupt_queue(vcpu);
    kvm_clear_exception_queue(vcpu);
    vcpu->arch.exception.pending = false;
........
    /* Zero the cached register array and set every avail/dirty bit. */
    memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
    vcpu->arch.regs_avail = ~0;
    vcpu->arch.regs_dirty = ~0;

    vcpu->arch.ia32_xss = 0;

    /* Finally hand over to the vendor-specific reset hook. */
    kvm_x86_ops->vcpu_reset(vcpu, init_event);
}