上篇文章提到,kvm_create_vm函数会创建一台虚拟机,用户态可以通过ioctl系统调用访问该虚拟机。同样,对于cpu,kvm也提供了类似的方式来实现创建和访问,对应的函数是kvm_vm_ioctl
/*
 * File operations table named for the VM file descriptor: closing the file
 * goes to kvm_vm_release, ioctl() goes to kvm_vm_ioctl, and seeking is
 * routed to noop_llseek.
 */
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl,
.llseek = noop_llseek,
/* NOTE(review): KVM_COMPAT presumably installs the compat ioctl handler; its expansion is not shown here. */
KVM_COMPAT(kvm_vm_compat_ioctl),
};
用户态进程qemu通过系统调用ioctl访问kvm_vm_ioctl函数
/*
 * Obtain a file descriptor for the vCPU identified by @vcpu_id.
 *
 * If an entry with a matching id is found on s->kvm_parked_vcpus, that entry
 * is unlinked and freed and its stored descriptor is returned. Otherwise the
 * KVM_CREATE_VCPU ioctl is issued on the VM and its result is returned
 * (kvm_vm_ioctl reports failures as a negative errno value).
 */
static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *parked;

    QLIST_FOREACH(parked, &s->kvm_parked_vcpus, node) {
        int fd;

        if (parked->vcpu_id != vcpu_id) {
            continue;
        }

        /* Take the stored descriptor, then drop the list entry. */
        fd = parked->kvm_fd;
        QLIST_REMOVE(parked, node);
        g_free(parked);
        return fd;
    }

    /* No matching parked entry: ask the kernel for a new vCPU. */
    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}
/*
 * Issue an ioctl on the VM file descriptor stored in @s.
 *
 * @s:    state object holding the VM descriptor (s->vmfd)
 * @type: ioctl request number
 * @...:  exactly one pointer-sized argument, forwarded unchanged to ioctl()
 *
 * Returns the ioctl() result, or -errno when ioctl() returns -1.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    /* Only the first variadic argument is consumed. */
    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    trace_kvm_vm_ioctl(type, payload);

    rc = ioctl(s->vmfd, type, payload);
    return (rc == -1) ? -errno : rc;
}
其中 kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 通过传入KVM_CREATE_VCPU标识表示这是创建VCPU的动作,参数vcpu_id是用户态下的apic_id,即VCPU索引号。内核态对应的处理函数同样名为kvm_vm_ioctl,代码如下
/*
 * Kernel-side ioctl handler for a VM file (installed as .unlocked_ioctl in
 * kvm_vm_fops). This excerpt is abridged: only the KVM_CREATE_VCPU case is
 * shown and the remaining cases and the function tail are elided.
 */
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
/* The struct kvm for this VM is kept in the file's private_data. */
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
/* Refuse callers whose mm differs from the one recorded in the VM. */
if (kvm->mm != current->mm)
return -EIO;
switch (ioctl) {
case KVM_CREATE_VCPU:
/* For this command arg is passed through by value as the vCPU id. */
r = kvm_vm_ioctl_create_vcpu(kvm, arg);
break;
........
}
创建vcpu函数kvm_vm_ioctl_create_vcpu,函数如下
/*
 * Create a vCPU with identifier @id in VM @kvm and publish it in kvm->vcpus.
 *
 * Returns the (non-negative) result of create_vcpu_fd() on success.
 * Returns -EINVAL if @id >= KVM_MAX_VCPU_ID or if created_vcpus has already
 * reached KVM_MAX_VCPUS, -EEXIST if a vCPU with this id already exists, or
 * the error reported by kvm_arch_vcpu_create(), kvm_arch_vcpu_setup() or
 * create_vcpu_fd().
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
int r;
struct kvm_vcpu *vcpu;
if (id >= KVM_MAX_VCPU_ID)
return -EINVAL;
/*
 * Reserve a slot in created_vcpus under kvm->lock before doing the actual
 * creation; the lock is dropped while the vCPU is built and the
 * reservation is undone at vcpu_decrement on any failure.
 */
mutex_lock(&kvm->lock);
if (kvm->created_vcpus == KVM_MAX_VCPUS) {
mutex_unlock(&kvm->lock);
return -EINVAL;
}
kvm->created_vcpus++;
mutex_unlock(&kvm->lock);
/* Architecture-specific allocation; failure is reported via ERR_PTR. */
vcpu = kvm_arch_vcpu_create(kvm, id);
if (IS_ERR(vcpu)) {
r = PTR_ERR(vcpu);
goto vcpu_decrement;
}
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
r = kvm_arch_vcpu_setup(vcpu);
if (r)
goto vcpu_destroy;
kvm_create_vcpu_debugfs(vcpu);
/* Retake the lock for the duplicate-id check and for publication. */
mutex_lock(&kvm->lock);
if (kvm_get_vcpu_by_id(kvm, id)) {
r = -EEXIST;
goto unlock_vcpu_destroy;
}
/* The new vCPU takes the next free index; that slot must still be empty. */
vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
/* Now it's all set up, let userspace reach it */
/* Take a VM reference before creating the vCPU fd; dropped again if fd creation fails. */
kvm_get_kvm(kvm);
r = create_vcpu_fd(vcpu);
if (r < 0) {
kvm_put_kvm_no_destroy(kvm);
goto unlock_vcpu_destroy;
}
kvm->vcpus[vcpu->vcpu_idx] = vcpu;
/*
 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus
 * before kvm->online_vcpu's incremented value.
 */
smp_wmb();
atomic_inc(&kvm->online_vcpus);
mutex_unlock(&kvm->lock);
kvm_arch_vcpu_postcreate(vcpu);
return r;
/* Error unwinding: each label undoes one more step, in reverse order of setup. */
unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
debugfs_remove_recursive(vcpu->debugfs_dentry);
vcpu_destroy:
kvm_arch_vcpu_destroy(vcpu);
vcpu_decrement:
/* Give back the created_vcpus slot reserved at the top. */
mutex_lock(&kvm->lock);
kvm->created_vcpus--;
mutex_unlock(&kvm->lock);
return r;
}
kvm_vm_ioctl_create_vcpu首先判断传入的vcpu id是否有效,即是否超过最大VCPU编号KVM_MAX_VCPU_ID;随后在持有kvm->lock的情况下检查已创建的VCPU个数是否已达到上限KVM_MAX_VCPUS(从5.x内核了解到,x86上最大vcpu数为288)。任一检查不通过都会直接返回-EINVAL。然后调用kvm_arch_vcpu_create,由于这里以intel为例,所以最终调用的是vmx的回调函数。
/*
 * Architecture hook for vCPU creation.
 *
 * Emits a one-time warning when kvm_check_tsc_unstable() reports an unstable
 * TSC and the VM already has at least one online vCPU, then delegates the
 * actual creation to the vendor callback kvm_x86_ops->vcpu_create and
 * returns whatever that callback returns.
 */
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
				      unsigned int id)
{
	if (kvm_check_tsc_unstable()) {
		/* Only an SMP guest (a vCPU is already online) triggers the warning. */
		if (atomic_read(&kvm->online_vcpus) != 0) {
			printk_once(KERN_WARNING
				    "kvm: SMP vm created on host with unstable TSC; "
				    "guest TSC will not be reliable\n");
		}
	}

	return kvm_x86_ops->vcpu_create(kvm, id);
}
vcpu_create函数是个回调函数,这里以intel为例,因此调用的函数是vmx_create_vcpu,所在文件是arch/x86/kvm/vmx/vmx.c
/*
 * VMX implementation of the vcpu_create callback (abridged excerpt).
 *
 * Allocates a zeroed struct vcpu_vmx, its user and guest FPU areas, a VPID,
 * and then runs the generic kvm_vcpu_init() on the embedded struct kvm_vcpu.
 * Allocation failures are reported as ERR_PTR(-ENOMEM) or via err = -ENOMEM.
 *
 * NOTE(review): the rest of the function, including the cleanup labels
 * free_partial_vcpu, free_user_fpu and free_vcpu, is not part of this excerpt.
 */
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
struct vcpu_vmx *vmx;
unsigned long *msr_bitmap;
int i, cpu;
/* Compile-time check that the generic vcpu is the first member of vcpu_vmx. */
BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
"struct kvm_vcpu must be at offset 0 for arch usercopy region");
vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!vmx)
return ERR_PTR(-ENOMEM);
/* FPU area labelled in the error message as belonging to kvm userspace. */
vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
GFP_KERNEL_ACCOUNT);
if (!vmx->vcpu.arch.user_fpu) {
printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
err = -ENOMEM;
goto free_partial_vcpu;
}
/* FPU area labelled in the error message as belonging to the vcpu. */
vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
GFP_KERNEL_ACCOUNT);
if (!vmx->vcpu.arch.guest_fpu) {
printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
err = -ENOMEM;
goto free_user_fpu;
}
vmx->vpid = allocate_vpid();
/* Generic (architecture-independent) initialisation of the embedded vcpu. */
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
if (err)
goto free_vcpu;
/* Preset the error code; presumably for allocations in the elided remainder. */
err = -ENOMEM;
}
vmx的VCPU用结构vcpu_vmx表示,vcpu_vmx中第一个成员是类型为kvm_vcpu的vcpu变量,表示的是通用层面的VCPU。vmx_create_vcpu函数从kvm_vcpu_cache缓存中分配一个vcpu_vmx结构表示VCPU,这个缓存是在kvm_init中初始化的。allocate_vpid为该VCPU分配一个vpid,vpid用于开启EPT的情况:每个VCPU与一个VPID关联,这样在进行VCPU切换时就不必将TLB中的数据全部冲刷掉,从而提高性能。
vmx_create_vcpu函数紧接着调用kvm_vcpu_init函数对VCPU的通用部分进行初始化,此时的参数是vcpu_vmx的kvm_vcpu成员。kvm_vcpu_init对kvm_vcpu进行初始化,在其中会调用kvm_arch_vcpu_init对kvm_vcpu架构相关成员kvm_vcpu_arch进行初始化,kvm_arch_vcpu_init还会进行创建local APIC中断控制器、创建MMU等跟架构密切相关的工作。
如果使能了enable_pml,则分配pml使用的页。pml是page modification logging的缩写,用于在硬件层面记录虚拟机中被修改(写入)过的物理页面,从而能够快速实现脏页标记。
vmx_create_vcpu接着分配保存虚拟机VCPU的msr寄存器的空间,并设置VCPU的loaded_vmcs成员指向另一个成员vmcs01。loaded_vmcs指向的是当前VCPU对应的VMCS区域,这里的vmcs01表示的是普通虚拟化,也就是没有嵌套的虚拟化;如果是嵌套虚拟化,该指针会指向其他值。然后调用alloc_vmcs为该VCPU分配VMCS区域,其大小是由全局变量vmcs_config.order决定的,再调用loaded_vmcs_init初始化刚刚分配的VMCS区域。
vmx_vcpu_load函数将per CPU变量current_vmcs设置成刚刚分配的VMCS,然后执行vmptrld指令。init_vmcs函数用来初始化当前物理CPU对应的VMCS结构。
以上就是vmx_create_vcpu的主要核心工作内容。可以看到,vmx_create_vcpu分配了一个代表VCPU的vcpu_vmx结构,然后分配对应的VMCS结构,将其与当前物理CPU绑定,并对该VMCS进行初始化。除了vmx_create_vcpu之外,还有一个函数比较重要,那就是kvm_arch_vcpu_setup,函数代码如下
/*
 * Architecture-specific setup of a newly created vCPU.
 *
 * Records the arch capabilities and platform-info MSR values, initialises
 * MTRR state, and then resets the vCPU and initialises its MMU between
 * vcpu_load() and vcpu_put(). Always returns 0.
 */
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
/* The reset and MMU init run with the vCPU loaded, bracketed by vcpu_load()/vcpu_put(). */
vcpu_load(vcpu);
/* init_event == false for the reset performed at creation time. */
kvm_vcpu_reset(vcpu, false);
kvm_init_mmu(vcpu, false);
vcpu_put(vcpu);
return 0;
}
/*
 * Load @vcpu on the CPU this code is currently running on: register the
 * vCPU's preempt notifier and invoke the architecture load hook, all between
 * get_cpu() and put_cpu().
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
/* get_cpu() yields the current CPU number; paired with put_cpu() below. */
int cpu = get_cpu();
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
}
/*
 * x86 hook run when @vcpu is loaded on physical CPU @cpu (abridged excerpt;
 * the statements after the vendor vcpu_load call are elided).
 *
 * Handles WBINVD bookkeeping when emulation of WBINVD is needed, then calls
 * the vendor vcpu_load callback.
 */
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
/* If the vendor reports WBINVD exits, just mark this CPU in the dirty mask. */
if (kvm_x86_ops->has_wbinvd_exit())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
/* Otherwise, if the vCPU last ran on a different CPU, run wbinvd_ipi there. */
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
kvm_x86_ops->vcpu_load(vcpu, cpu);
.........
}
vcpu_load函数会在禁止抢占的情况下注册VCPU的preempt_notifier,然后调用kvm_arch_vcpu_load。kvm_arch_vcpu_load的主要作用就是调用vmx实现的vcpu_load回调,即vmx_vcpu_load,完成当前物理CPU与该VCPU的VMCS的绑定。
kvm_vcpu_reset函数对kvm_vcpu的成员进行初始化,不仅包括通用的VCPU结构,还包括架构相关的vcpu.arch成员。最后调用vmx的vcpu_reset回调vmx_vcpu_reset函数设置VMCS的相关域。
/*
 * Reset the x86 state of @vcpu (abridged excerpt; part of the body is
 * elided by the dotted line).
 *
 * @init_event is forwarded unchanged to kvm_lapic_reset() and to the vendor
 * vcpu_reset callback.
 */
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
kvm_lapic_reset(vcpu, init_event);
/* Clear hflags and all pending/queued SMI, NMI, interrupt and exception state. */
vcpu->arch.hflags = 0;
vcpu->arch.smi_pending = 0;
vcpu->arch.smi_count = 0;
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
kvm_clear_interrupt_queue(vcpu);
kvm_clear_exception_queue(vcpu);
vcpu->arch.exception.pending = false;
........
/* Zero the cached register array. */
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
/* All bits set; presumably marks every cached register as available and dirty. */
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
vcpu->arch.ia32_xss = 0;
/* Finish with the vendor-specific reset. */
kvm_x86_ops->vcpu_reset(vcpu, init_event);
}